From 672c8342f6fe2a0b65aa3e7c87d4f0b41a9e4454 Mon Sep 17 00:00:00 2001 From: "kaf24@scramble.cl.cam.ac.uk" Date: Wed, 17 Mar 2004 13:34:52 +0000 Subject: [PATCH] bitkeeper revision 1.802.1.1 (405853fcN7rcf_nAOUv8-8C-udNDkw) vnetif.c, Makefile: new file Many files: Relaid out xenolinux drivers. More x86_64 stuff. domain_page.c: Rename: xen/common/domain_page.c -> xen/arch/i386/domain_page.c vbd.c: Rename: xenolinux-2.4.25-sparse/arch/xeno/drivers/block/xl_vbd.c -> xenolinux-2.4.25-sparse/arch/xeno/drivers/block/vbd.c block.h: Rename: xenolinux-2.4.25-sparse/arch/xeno/drivers/block/xl_block.h -> xenolinux-2.4.25-sparse/arch/xeno/drivers/block/block.h block.c: Rename: xenolinux-2.4.25-sparse/arch/xeno/drivers/block/xl_block.c -> xenolinux-2.4.25-sparse/arch/xeno/drivers/block/block.c core.c: Rename: xenolinux-2.4.25-sparse/arch/xeno/drivers/dom0/dom0_core.c -> xenolinux-2.4.25-sparse/arch/xeno/drivers/dom0/core.c evtchn.c: Rename: xenolinux-2.4.25-sparse/arch/xeno/drivers/evtchn/xl_evtchn.c -> xenolinux-2.4.25-sparse/arch/xeno/drivers/evtchn/evtchn.c --- .rootkeys | 14 +- tools/xc/lib/xc_linux_build.c | 38 +- tools/xc/lib/xc_linux_restore.c | 20 +- tools/xc/lib/xc_linux_save.c | 6 +- tools/xc/lib/xc_netbsd_build.c | 38 +- xen/{common => arch/i386}/domain_page.c | 0 xen/arch/i386/mm.c | 4 +- xen/arch/i386/process.c | 4 +- xen/common/dom0_ops.c | 16 +- xen/common/domain.c | 59 +- xen/common/kernel.c | 14 +- xen/common/lib.c | 152 +---- xen/include/asm-i386/config.h | 2 + xen/include/asm-i386/processor.h | 2 +- xen/include/asm-i386/types.h | 2 + xen/include/asm-x86_64/atomic.h | 6 +- xen/include/asm-x86_64/config.h | 225 +++++-- xen/include/asm-x86_64/current.h | 4 +- xen/include/asm-x86_64/desc.h | 2 +- xen/include/asm-x86_64/io.h | 6 +- xen/include/asm-x86_64/ldt.h | 8 +- xen/include/asm-x86_64/page.h | 1 - xen/include/asm-x86_64/pci.h | 52 +- xen/include/asm-x86_64/processor.h | 3 +- xen/include/asm-x86_64/types.h | 2 + xen/include/asm-x86_64/uaccess.h | 10 +- 
.../hypervisor-ifs/arch-i386/hypervisor-if.h | 13 +- .../arch-x86_64/hypervisor-if.h | 16 +- xen/include/hypervisor-ifs/dom0_ops.h | 5 +- xen/include/xeno/lib.h | 2 +- xen/include/xeno/types.h | 2 - xen/net/dev.c | 2 +- xenolinux-2.4.25-sparse/arch/xeno/Makefile | 15 +- .../arch/xeno/drivers/balloon/Makefile | 2 +- .../arch/xeno/drivers/block/Makefile | 4 +- .../drivers/block/{xl_block.c => block.c} | 8 +- .../drivers/block/{xl_block.h => block.h} | 8 +- .../xeno/drivers/block/{xl_vbd.c => vbd.c} | 4 +- .../arch/xeno/drivers/console/Makefile | 2 +- .../arch/xeno/drivers/dom0/Makefile | 4 +- .../xeno/drivers/dom0/{dom0_core.c => core.c} | 8 +- .../arch/xeno/drivers/evtchn/Makefile | 4 +- .../drivers/evtchn/{xl_evtchn.c => evtchn.c} | 2 +- .../arch/xeno/drivers/network/Makefile | 2 +- .../arch/xeno/drivers/network/network.c | 2 +- .../arch/xeno/drivers/vnetif/Makefile | 3 + .../arch/xeno/drivers/vnetif/vnetif.c | 552 ++++++++++++++++++ 47 files changed, 985 insertions(+), 365 deletions(-) rename xen/{common => arch/i386}/domain_page.c (100%) rename xenolinux-2.4.25-sparse/arch/xeno/drivers/block/{xl_block.c => block.c} (98%) rename xenolinux-2.4.25-sparse/arch/xeno/drivers/block/{xl_block.h => block.h} (94%) rename xenolinux-2.4.25-sparse/arch/xeno/drivers/block/{xl_vbd.c => vbd.c} (99%) rename xenolinux-2.4.25-sparse/arch/xeno/drivers/dom0/{dom0_core.c => core.c} (94%) rename xenolinux-2.4.25-sparse/arch/xeno/drivers/evtchn/{xl_evtchn.c => evtchn.c} (99%) create mode 100644 xenolinux-2.4.25-sparse/arch/xeno/drivers/vnetif/Makefile create mode 100644 xenolinux-2.4.25-sparse/arch/xeno/drivers/vnetif/vnetif.c diff --git a/.rootkeys b/.rootkeys index 3e0cb1e20f..bac283ef59 100644 --- a/.rootkeys +++ b/.rootkeys @@ -117,6 +117,7 @@ 3ddb79bcsjinG9k1KcvbVBuas1R2dA xen/arch/i386/apic.c 3ddb79bcSC_LvnmFlX-T5iTgaR0SKg xen/arch/i386/boot/boot.S 3ddb79bcUrk2EIaM5VsT6wUudH1kkg xen/arch/i386/delay.c +3e32af9aRnYGl4GMOaDKp7JdfhOGhg xen/arch/i386/domain_page.c 
3ddb79bcecupHj56ZbTa3B0FxDowMg xen/arch/i386/entry.S 3ddb79bcY5zW7KhvI9gvfuPi3ZumEg xen/arch/i386/extable.c 3fe443fdDDb0Sw6NQBCk4GQapayfTA xen/arch/i386/flushtlb.c @@ -154,7 +155,6 @@ 3ddb79bdLX_P6iB7ILiblRLWvebapg xen/common/dom0_ops.c 3e6377e4i0c9GtKN65e99OtRbw3AZw xen/common/dom_mem_ops.c 3ddb79bdYO5D8Av12NHqPeSviav7cg xen/common/domain.c -3e32af9aRnYGl4GMOaDKp7JdfhOGhg xen/common/domain_page.c 3ddb79bdeyutmaXEfpQvvxj7eQ0fCw xen/common/event.c 3fba5b96H0khoxNiKbjdi0inpXV-Pw xen/common/event_channel.c 3ddb79bd9drcFPVxd4w2GPOIjLlXpA xen/common/kernel.c @@ -600,18 +600,20 @@ 3e6377f5xwPfYZkPHPrDbEq1PRN7uQ xenolinux-2.4.25-sparse/arch/xeno/drivers/balloon/Makefile 3e6377f8Me8IqtvEhb70XFgOvqQH7A xenolinux-2.4.25-sparse/arch/xeno/drivers/balloon/balloon.c 3e5a4e65iHEuC5sjFhj42XALYbLVRw xenolinux-2.4.25-sparse/arch/xeno/drivers/block/Makefile -3e5a4e65pP5spJErBW69pJxSSdK9RA xenolinux-2.4.25-sparse/arch/xeno/drivers/block/xl_block.c -3e67f822FOPwqHiaRKbrskgWgoNL5g xenolinux-2.4.25-sparse/arch/xeno/drivers/block/xl_block.h -3e676eb5RXnHzSHgA1BvM0B1aIm4qg xenolinux-2.4.25-sparse/arch/xeno/drivers/block/xl_vbd.c +3e5a4e65pP5spJErBW69pJxSSdK9RA xenolinux-2.4.25-sparse/arch/xeno/drivers/block/block.c +3e67f822FOPwqHiaRKbrskgWgoNL5g xenolinux-2.4.25-sparse/arch/xeno/drivers/block/block.h +3e676eb5RXnHzSHgA1BvM0B1aIm4qg xenolinux-2.4.25-sparse/arch/xeno/drivers/block/vbd.c 3e5a4e65G3e2s0ghPMgiJ-gBTUJ0uQ xenolinux-2.4.25-sparse/arch/xeno/drivers/console/Makefile 3e5a4e651TH-SXHoufurnWjgl5bfOA xenolinux-2.4.25-sparse/arch/xeno/drivers/console/console.c 3e5a4e656nfFISThfbyXQOA6HN6YHw xenolinux-2.4.25-sparse/arch/xeno/drivers/dom0/Makefile -3e5a4e65BXtftInNHUC2PjDfPhdZZA xenolinux-2.4.25-sparse/arch/xeno/drivers/dom0/dom0_core.c +3e5a4e65BXtftInNHUC2PjDfPhdZZA xenolinux-2.4.25-sparse/arch/xeno/drivers/dom0/core.c 3e5a4e65gfn_ltB8ujHMVFApnTTNRQ xenolinux-2.4.25-sparse/arch/xeno/drivers/dom0/vfr.c 40420a6ebRqDjufoN1WSJvolEW2Wjw 
xenolinux-2.4.25-sparse/arch/xeno/drivers/evtchn/Makefile -40420a73Wou6JlsZDiu6YwjYomsm7A xenolinux-2.4.25-sparse/arch/xeno/drivers/evtchn/xl_evtchn.c +40420a73Wou6JlsZDiu6YwjYomsm7A xenolinux-2.4.25-sparse/arch/xeno/drivers/evtchn/evtchn.c 3e5a4e65gZBRBB6RsSVg1c9iahigAw xenolinux-2.4.25-sparse/arch/xeno/drivers/network/Makefile 3e5a4e65ZxKrbFetVB84JhrTyZ1YuQ xenolinux-2.4.25-sparse/arch/xeno/drivers/network/network.c +405853f2wg7JXZJNltspMwOZJklxgw xenolinux-2.4.25-sparse/arch/xeno/drivers/vnetif/Makefile +405853f6nbeazrNyEWNHBuoSg2PiPA xenolinux-2.4.25-sparse/arch/xeno/drivers/vnetif/vnetif.c 3e5a4e65lWzkiPXsZdzPt2RNnJGG1g xenolinux-2.4.25-sparse/arch/xeno/kernel/Makefile 3e5a4e65_hqfuxtGG8IUy6wRM86Ecg xenolinux-2.4.25-sparse/arch/xeno/kernel/entry.S 3e5a4e65Hy_1iUvMTPsNqGNXd9uFpg xenolinux-2.4.25-sparse/arch/xeno/kernel/head.S diff --git a/tools/xc/lib/xc_linux_build.c b/tools/xc/lib/xc_linux_build.c index 786d372c39..0659bb99c7 100644 --- a/tools/xc/lib/xc_linux_build.c +++ b/tools/xc/lib/xc_linux_build.c @@ -378,32 +378,32 @@ int xc_linux_build(int xc_handle, /* * Initial register values: - * DS,ES,FS,GS = FLAT_RING1_DS - * CS:EIP = FLAT_RING1_CS:start_pc - * SS:ESP = FLAT_RING1_DS:start_stack + * DS,ES,FS,GS = FLAT_GUESTOS_DS + * CS:EIP = FLAT_GUESTOS_CS:start_pc + * SS:ESP = FLAT_GUESTOS_DS:start_stack * ESI = start_info * [EAX,EBX,ECX,EDX,EDI,EBP are zero] * EFLAGS = IF | 2 (bit 1 is reserved and should always be 1) */ - ctxt->i386_ctxt.ds = FLAT_RING1_DS; - ctxt->i386_ctxt.es = FLAT_RING1_DS; - ctxt->i386_ctxt.fs = FLAT_RING1_DS; - ctxt->i386_ctxt.gs = FLAT_RING1_DS; - ctxt->i386_ctxt.ss = FLAT_RING1_DS; - ctxt->i386_ctxt.cs = FLAT_RING1_CS; - ctxt->i386_ctxt.eip = load_addr; - ctxt->i386_ctxt.esp = virt_startinfo_addr; - ctxt->i386_ctxt.esi = virt_startinfo_addr; - ctxt->i386_ctxt.eflags = (1<<9) | (1<<2); + ctxt->cpu_ctxt.ds = FLAT_GUESTOS_DS; + ctxt->cpu_ctxt.es = FLAT_GUESTOS_DS; + ctxt->cpu_ctxt.fs = FLAT_GUESTOS_DS; + ctxt->cpu_ctxt.gs = 
FLAT_GUESTOS_DS; + ctxt->cpu_ctxt.ss = FLAT_GUESTOS_DS; + ctxt->cpu_ctxt.cs = FLAT_GUESTOS_CS; + ctxt->cpu_ctxt.eip = load_addr; + ctxt->cpu_ctxt.esp = virt_startinfo_addr; + ctxt->cpu_ctxt.esi = virt_startinfo_addr; + ctxt->cpu_ctxt.eflags = (1<<9) | (1<<2); /* FPU is set up to default initial state. */ - memset(ctxt->i387_ctxt, 0, sizeof(ctxt->i387_ctxt)); + memset(ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); /* Virtual IDT is empty at start-of-day. */ for ( i = 0; i < 256; i++ ) { ctxt->trap_ctxt[i].vector = i; - ctxt->trap_ctxt[i].cs = FLAT_RING1_CS; + ctxt->trap_ctxt[i].cs = FLAT_GUESTOS_CS; } ctxt->fast_trap_idx = 0; @@ -414,16 +414,16 @@ int xc_linux_build(int xc_handle, ctxt->gdt_ents = 0; /* Ring 1 stack is the initial stack. */ - ctxt->ring1_ss = FLAT_RING1_DS; - ctxt->ring1_esp = virt_startinfo_addr; + ctxt->guestos_ss = FLAT_GUESTOS_DS; + ctxt->guestos_esp = virt_startinfo_addr; /* No debugging. */ memset(ctxt->debugreg, 0, sizeof(ctxt->debugreg)); /* No callback handlers. */ - ctxt->event_callback_cs = FLAT_RING1_CS; + ctxt->event_callback_cs = FLAT_GUESTOS_CS; ctxt->event_callback_eip = 0; - ctxt->failsafe_callback_cs = FLAT_RING1_CS; + ctxt->failsafe_callback_cs = FLAT_GUESTOS_CS; ctxt->failsafe_callback_eip = 0; launch_op.u.builddomain.domain = (domid_t)domid; diff --git a/tools/xc/lib/xc_linux_restore.c b/tools/xc/lib/xc_linux_restore.c index 3b00a81011..f0a2127bcc 100644 --- a/tools/xc/lib/xc_linux_restore.c +++ b/tools/xc/lib/xc_linux_restore.c @@ -313,13 +313,13 @@ int xc_linux_restore(int xc_handle, verbose_printf("\b\b\b\b100%%\nMemory reloaded.\n"); /* Uncanonicalise the suspend-record frame number and poke resume rec. 
*/ - pfn = ctxt.i386_ctxt.esi; + pfn = ctxt.cpu_ctxt.esi; if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NONE) ) { ERROR("Suspend record frame number is bad"); goto out; } - ctxt.i386_ctxt.esi = mfn = pfn_to_mfn_table[pfn]; + ctxt.cpu_ctxt.esi = mfn = pfn_to_mfn_table[pfn]; p_srec = map_pfn_writeable(pm_handle, mfn); p_srec->resume_info.nr_pages = nr_pfns; p_srec->resume_info.shared_info = shared_info_frame << PAGE_SHIFT; @@ -370,13 +370,13 @@ int xc_linux_restore(int xc_handle, /* * Safety checking of saved context: - * 1. i386_ctxt is fine, as Xen checks that on context switch. - * 2. i387_ctxt is fine, as it can't hurt Xen. + * 1. cpu_ctxt is fine, as Xen checks that on context switch. + * 2. fpu_ctxt is fine, as it can't hurt Xen. * 3. trap_ctxt needs the code selectors checked. * 4. fast_trap_idx is checked by Xen. * 5. ldt base must be page-aligned, no more than 8192 ents, ... * 6. gdt already done, and further checking is done by Xen. - * 7. check that ring1_ss is safe. + * 7. check that guestos_ss is safe. * 8. pt_base is already done. * 9. debugregs are checked by Xen. * 10. callback code selectors need checking. 
@@ -385,14 +385,14 @@ int xc_linux_restore(int xc_handle, { ctxt.trap_ctxt[i].vector = i; if ( (ctxt.trap_ctxt[i].cs & 3) == 0 ) - ctxt.trap_ctxt[i].cs = FLAT_RING1_CS; + ctxt.trap_ctxt[i].cs = FLAT_GUESTOS_CS; } - if ( (ctxt.ring1_ss & 3) == 0 ) - ctxt.ring1_ss = FLAT_RING1_DS; + if ( (ctxt.guestos_ss & 3) == 0 ) + ctxt.guestos_ss = FLAT_GUESTOS_DS; if ( (ctxt.event_callback_cs & 3) == 0 ) - ctxt.event_callback_cs = FLAT_RING1_CS; + ctxt.event_callback_cs = FLAT_GUESTOS_CS; if ( (ctxt.failsafe_callback_cs & 3) == 0 ) - ctxt.failsafe_callback_cs = FLAT_RING1_CS; + ctxt.failsafe_callback_cs = FLAT_GUESTOS_CS; if ( ((ctxt.ldt_base & (PAGE_SIZE - 1)) != 0) || (ctxt.ldt_ents > 8192) || (ctxt.ldt_base > HYPERVISOR_VIRT_START) || diff --git a/tools/xc/lib/xc_linux_save.c b/tools/xc/lib/xc_linux_save.c index 1695bd63c0..aece21a017 100644 --- a/tools/xc/lib/xc_linux_save.c +++ b/tools/xc/lib/xc_linux_save.c @@ -182,14 +182,14 @@ int xc_linux_save(int xc_handle, goto out; /* Is the suspend-record MFN actually valid for this domain? */ - if ( !check_pfn_ownership(xc_handle, ctxt.i386_ctxt.esi, domid) ) + if ( !check_pfn_ownership(xc_handle, ctxt.cpu_ctxt.esi, domid) ) { ERROR("Invalid state record pointer"); goto out; } /* If the suspend-record MFN is okay then grab a copy of it to @srec. */ - p_srec = map_pfn_readonly(pm_handle, ctxt.i386_ctxt.esi); + p_srec = map_pfn_readonly(pm_handle, ctxt.cpu_ctxt.esi); memcpy(&srec, p_srec, sizeof(srec)); unmap_pfn(pm_handle, p_srec); @@ -272,7 +272,7 @@ int xc_linux_save(int xc_handle, } /* Canonicalise the suspend-record frame number. 
*/ - if ( !translate_mfn_to_pfn(&ctxt.i386_ctxt.esi) ) + if ( !translate_mfn_to_pfn(&ctxt.cpu_ctxt.esi) ) { ERROR("State record is not in range of pseudophys map"); goto out; diff --git a/tools/xc/lib/xc_netbsd_build.c b/tools/xc/lib/xc_netbsd_build.c index 56fd35dbab..3472f32257 100644 --- a/tools/xc/lib/xc_netbsd_build.c +++ b/tools/xc/lib/xc_netbsd_build.c @@ -273,32 +273,32 @@ int xc_netbsd_build(int xc_handle, /* * Initial register values: - * DS,ES,FS,GS = FLAT_RING1_DS - * CS:EIP = FLAT_RING1_CS:start_pc - * SS:ESP = FLAT_RING1_DS:start_stack + * DS,ES,FS,GS = FLAT_GUESTOS_DS + * CS:EIP = FLAT_GUESTOS_CS:start_pc + * SS:ESP = FLAT_GUESTOS_DS:start_stack * ESI = start_info * [EAX,EBX,ECX,EDX,EDI,EBP are zero] * EFLAGS = IF | 2 (bit 1 is reserved and should always be 1) */ - ctxt->i386_ctxt.ds = FLAT_RING1_DS; - ctxt->i386_ctxt.es = FLAT_RING1_DS; - ctxt->i386_ctxt.fs = FLAT_RING1_DS; - ctxt->i386_ctxt.gs = FLAT_RING1_DS; - ctxt->i386_ctxt.ss = FLAT_RING1_DS; - ctxt->i386_ctxt.cs = FLAT_RING1_CS; - ctxt->i386_ctxt.eip = load_addr; - ctxt->i386_ctxt.esp = virt_startinfo_addr; - ctxt->i386_ctxt.esi = virt_startinfo_addr; - ctxt->i386_ctxt.eflags = (1<<9) | (1<<2); + ctxt->cpu_ctxt.ds = FLAT_GUESTOS_DS; + ctxt->cpu_ctxt.es = FLAT_GUESTOS_DS; + ctxt->cpu_ctxt.fs = FLAT_GUESTOS_DS; + ctxt->cpu_ctxt.gs = FLAT_GUESTOS_DS; + ctxt->cpu_ctxt.ss = FLAT_GUESTOS_DS; + ctxt->cpu_ctxt.cs = FLAT_GUESTOS_CS; + ctxt->cpu_ctxt.eip = load_addr; + ctxt->cpu_ctxt.esp = virt_startinfo_addr; + ctxt->cpu_ctxt.esi = virt_startinfo_addr; + ctxt->cpu_ctxt.eflags = (1<<9) | (1<<2); /* FPU is set up to default initial state. */ - memset(ctxt->i387_ctxt, 0, sizeof(ctxt->i387_ctxt)); + memset(ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); /* Virtual IDT is empty at start-of-day. 
*/ for ( i = 0; i < 256; i++ ) { ctxt->trap_ctxt[i].vector = i; - ctxt->trap_ctxt[i].cs = FLAT_RING1_CS; + ctxt->trap_ctxt[i].cs = FLAT_GUESTOS_CS; } ctxt->fast_trap_idx = 0; @@ -309,16 +309,16 @@ int xc_netbsd_build(int xc_handle, ctxt->gdt_ents = 0; /* Ring 1 stack is the initial stack. */ - ctxt->ring1_ss = FLAT_RING1_DS; - ctxt->ring1_esp = virt_startinfo_addr; + ctxt->guestos_ss = FLAT_GUESTOS_DS; + ctxt->guestos_esp = virt_startinfo_addr; /* No debugging. */ memset(ctxt->debugreg, 0, sizeof(ctxt->debugreg)); /* No callback handlers. */ - ctxt->event_callback_cs = FLAT_RING1_CS; + ctxt->event_callback_cs = FLAT_GUESTOS_CS; ctxt->event_callback_eip = 0; - ctxt->failsafe_callback_cs = FLAT_RING1_CS; + ctxt->failsafe_callback_cs = FLAT_GUESTOS_CS; ctxt->failsafe_callback_eip = 0; launch_op.u.builddomain.domain = (domid_t)domid; diff --git a/xen/common/domain_page.c b/xen/arch/i386/domain_page.c similarity index 100% rename from xen/common/domain_page.c rename to xen/arch/i386/domain_page.c diff --git a/xen/arch/i386/mm.c b/xen/arch/i386/mm.c index 15dadf35b7..e892c61953 100644 --- a/xen/arch/i386/mm.c +++ b/xen/arch/i386/mm.c @@ -131,8 +131,8 @@ long do_stack_switch(unsigned long ss, unsigned long esp) if ( (ss & 3) == 0 ) return -EPERM; - current->thread.ss1 = ss; - current->thread.esp1 = esp; + current->thread.guestos_ss = ss; + current->thread.guestos_sp = esp; t->ss1 = ss; t->esp1 = esp; diff --git a/xen/arch/i386/process.c b/xen/arch/i386/process.c index 09170307a7..bcbc5f550a 100644 --- a/xen/arch/i386/process.c +++ b/xen/arch/i386/process.c @@ -264,8 +264,8 @@ void switch_to(struct task_struct *prev_p, struct task_struct *next_p) SET_FAST_TRAP(&next_p->thread); /* Switch the guest OS ring-1 stack. */ - tss->esp1 = next->esp1; - tss->ss1 = next->ss1; + tss->esp1 = next->guestos_sp; + tss->ss1 = next->guestos_ss; /* Maybe switch the debug registers. 
*/ if ( next->debugreg[7] ) diff --git a/xen/common/dom0_ops.c b/xen/common/dom0_ops.c index e359026371..b39ead491c 100644 --- a/xen/common/dom0_ops.c +++ b/xen/common/dom0_ops.c @@ -290,23 +290,25 @@ long do_dom0_op(dom0_op_t *u_dom0_op) { rmb(); /* Ensure that we see saved register state. */ op->u.getdomaininfo.ctxt.flags = 0; - memcpy(&op->u.getdomaininfo.ctxt.i386_ctxt, + memcpy(&op->u.getdomaininfo.ctxt.cpu_ctxt, &p->shared_info->execution_context, sizeof(p->shared_info->execution_context)); if ( test_bit(PF_DONEFPUINIT, &p->flags) ) op->u.getdomaininfo.ctxt.flags |= ECF_I387_VALID; - memcpy(&op->u.getdomaininfo.ctxt.i387_ctxt, + memcpy(&op->u.getdomaininfo.ctxt.fpu_ctxt, &p->thread.i387, sizeof(p->thread.i387)); memcpy(&op->u.getdomaininfo.ctxt.trap_ctxt, p->thread.traps, sizeof(p->thread.traps)); +#ifdef ARCH_HAS_FAST_TRAP if ( (p->thread.fast_trap_desc.a == 0) && (p->thread.fast_trap_desc.b == 0) ) op->u.getdomaininfo.ctxt.fast_trap_idx = 0; else op->u.getdomaininfo.ctxt.fast_trap_idx = p->thread.fast_trap_idx; +#endif op->u.getdomaininfo.ctxt.ldt_base = p->mm.ldt_base; op->u.getdomaininfo.ctxt.ldt_ents = p->mm.ldt_ents; op->u.getdomaininfo.ctxt.gdt_ents = 0; @@ -318,8 +320,8 @@ long do_dom0_op(dom0_op_t *u_dom0_op) op->u.getdomaininfo.ctxt.gdt_ents = (GET_GDT_ENTRIES(p) + 1) >> 3; } - op->u.getdomaininfo.ctxt.ring1_ss = p->thread.ss1; - op->u.getdomaininfo.ctxt.ring1_esp = p->thread.esp1; + op->u.getdomaininfo.ctxt.guestos_ss = p->thread.guestos_ss; + op->u.getdomaininfo.ctxt.guestos_esp = p->thread.guestos_sp; op->u.getdomaininfo.ctxt.pt_base = pagetable_val(p->mm.pagetable); memcpy(op->u.getdomaininfo.ctxt.debugreg, @@ -371,6 +373,12 @@ long do_dom0_op(dom0_op_t *u_dom0_op) case PGT_l2_page_table: op->u.getpageframeinfo.type = L2TAB; break; + case PGT_l3_page_table: + op->u.getpageframeinfo.type = L3TAB; + break; + case PGT_l4_page_table: + op->u.getpageframeinfo.type = L4TAB; + break; } } diff --git a/xen/common/domain.c b/xen/common/domain.c index 
53cea06285..8921ee246d 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -19,13 +19,16 @@ #include #include -/* - * NB. No ring-3 access in initial guestOS pagetables. Note that we allow - * ring-3 privileges in the page directories, so that the guestOS may later - * decide to share a 4MB region with applications. - */ +#if !defined(CONFIG_X86_64BITMODE) +/* No ring-3 access in initial page tables. */ #define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED) +#else +/* Allow ring-3 access in long mode as guest cannot use ring 1. */ +#define L1_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_USER) +#endif #define L2_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER) +#define L3_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER) +#define L4_PROT (_PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER) /* Both these structures are protected by the tasklist_lock. */ rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; @@ -426,20 +429,20 @@ void free_all_dom_mem(struct task_struct *p) put_page(page); /* - * Forcibly invalidate L2 tables at this point to break circular + * Forcibly invalidate base page tables at this point to break circular * 'linear page table' references. This is okay because MMU structures - * are not shared across domains and this domain is now dead. Thus L2 + * are not shared across domains and this domain is now dead. Thus base * tables are not in use so a non-zero count means circular reference. 
*/ y = page->type_and_flags; do { x = y; if ( likely((x & (PGT_type_mask|PGT_validated)) != - (PGT_l2_page_table|PGT_validated)) ) + (PGT_base_page_table|PGT_validated)) ) break; y = cmpxchg(&page->type_and_flags, x, x & ~PGT_validated); if ( likely(y == x) ) - free_page_type(page, PGT_l2_page_table); + free_page_type(page, PGT_base_page_table); } while ( unlikely(y != x) ); @@ -504,7 +507,7 @@ void release_task(struct task_struct *p) */ int final_setup_guestos(struct task_struct *p, dom0_builddomain_t *builddomain) { - unsigned long phys_l2tab; + unsigned long phys_basetab; int i; if ( test_bit(PF_CONSTRUCTED, &p->flags) ) @@ -514,16 +517,18 @@ int final_setup_guestos(struct task_struct *p, dom0_builddomain_t *builddomain) if ( builddomain->ctxt.flags & ECF_I387_VALID ) set_bit(PF_DONEFPUINIT, &p->flags); memcpy(&p->shared_info->execution_context, - &builddomain->ctxt.i386_ctxt, + &builddomain->ctxt.cpu_ctxt, sizeof(p->shared_info->execution_context)); memcpy(&p->thread.i387, - &builddomain->ctxt.i387_ctxt, + &builddomain->ctxt.fpu_ctxt, sizeof(p->thread.i387)); memcpy(p->thread.traps, &builddomain->ctxt.trap_ctxt, sizeof(p->thread.traps)); +#ifdef ARCH_HAS_FAST_TRAP SET_DEFAULT_FAST_TRAP(&p->thread); (void)set_fast_trap(p, builddomain->ctxt.fast_trap_idx); +#endif p->mm.ldt_base = builddomain->ctxt.ldt_base; p->mm.ldt_ents = builddomain->ctxt.ldt_ents; SET_GDT_ENTRIES(p, DEFAULT_GDT_ENTRIES); @@ -532,8 +537,8 @@ int final_setup_guestos(struct task_struct *p, dom0_builddomain_t *builddomain) (void)set_gdt(p, builddomain->ctxt.gdt_frames, builddomain->ctxt.gdt_ents); - p->thread.ss1 = builddomain->ctxt.ring1_ss; - p->thread.esp1 = builddomain->ctxt.ring1_esp; + p->thread.guestos_ss = builddomain->ctxt.guestos_ss; + p->thread.guestos_sp = builddomain->ctxt.guestos_esp; for ( i = 0; i < 8; i++ ) (void)set_debugreg(p, i, builddomain->ctxt.debugreg[i]); p->event_selector = builddomain->ctxt.event_callback_cs; @@ -541,10 +546,10 @@ int final_setup_guestos(struct 
task_struct *p, dom0_builddomain_t *builddomain) p->failsafe_selector = builddomain->ctxt.failsafe_callback_cs; p->failsafe_address = builddomain->ctxt.failsafe_callback_eip; - phys_l2tab = builddomain->ctxt.pt_base; - p->mm.pagetable = mk_pagetable(phys_l2tab); - get_page_and_type(&frame_table[phys_l2tab>>PAGE_SHIFT], p, - PGT_l2_page_table); + phys_basetab = builddomain->ctxt.pt_base; + p->mm.pagetable = mk_pagetable(phys_basetab); + get_page_and_type(&frame_table[phys_basetab>>PAGE_SHIFT], p, + PGT_base_page_table); /* Set up the shared info structure. */ update_dom_time(p->shared_info); @@ -620,6 +625,7 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, if ( strncmp(data_start, "XenoGues", 8) ) { printk("DOM%llu: Invalid guest OS image\n", dom); + unmap_domain_mem(data_start); return -1; } @@ -628,12 +634,14 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, { printk("DOM%llu: Guest OS load address not page-aligned (%08lx)\n", dom, virt_load_address); + unmap_domain_mem(data_start); return -1; } if ( alloc_new_dom_mem(p, params->memory_kb) ) { printk("DOM%llu: Not enough memory --- reduce dom0_mem ??\n", dom); + unmap_domain_mem(data_start); return -ENOMEM; } @@ -650,6 +658,7 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, dom, data_len>>20, (params->memory_kb)>>11, (params->memory_kb)>>10); + unmap_domain_mem(data_start); free_all_dom_mem(p); return -1; } @@ -664,11 +673,11 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, * We're basically forcing default RPLs to 1, so that our "what privilege * level are we returning to?" logic works. 
*/ - p->failsafe_selector = FLAT_RING1_CS; - p->event_selector = FLAT_RING1_CS; - p->thread.ss1 = FLAT_RING1_DS; + p->failsafe_selector = FLAT_GUESTOS_CS; + p->event_selector = FLAT_GUESTOS_CS; + p->thread.guestos_ss = FLAT_GUESTOS_DS; for ( i = 0; i < 256; i++ ) - p->thread.traps[i].cs = FLAT_RING1_CS; + p->thread.traps[i].cs = FLAT_GUESTOS_CS; /* * WARNING: The new domain must have its 'processor' field @@ -770,11 +779,11 @@ int setup_guestos(struct task_struct *p, dom0_createdomain_t *params, src++; if ( (((unsigned long)src) & (PAGE_SIZE-1)) == 0 ) { - unmap_domain_mem( vsrc-1 ); - vsrc = map_domain_mem( (unsigned long)src ); + unmap_domain_mem(vsrc-1); + vsrc = map_domain_mem((unsigned long)src); } } - unmap_domain_mem( vsrc ); + unmap_domain_mem(vsrc); /* Set up start info area. */ memset(virt_startinfo_address, 0, sizeof(*virt_startinfo_address)); diff --git a/xen/common/kernel.c b/xen/common/kernel.c index b963c6f5e9..f99f3fac32 100644 --- a/xen/common/kernel.c +++ b/xen/common/kernel.c @@ -190,16 +190,22 @@ void cmain(unsigned long magic, multiboot_info_t *mbi) for ( ; ; ) ; } + max_page = (mbi->mem_upper+1024) >> (PAGE_SHIFT - 10); + /* The array of pfn_info structures must fit into the reserved area. 
*/ - if ( sizeof(struct pfn_info) > 24 ) + if ( (sizeof(struct pfn_info) * max_page) > + (FRAMETABLE_VIRT_END - FRAMETABLE_VIRT_START) ) { - printk("'struct pfn_info' too large to fit in Xen address space!\n"); - for ( ; ; ) ; + unsigned long new_max = + (FRAMETABLE_VIRT_END - FRAMETABLE_VIRT_START) / + sizeof(struct pfn_info); + printk("Truncating available memory to %lu/%luMB\n", + new_max >> (20 - PAGE_SHIFT), max_page >> (20 - PAGE_SHIFT)); + max_page = new_max; } set_current(&idle0_task); - max_page = (mbi->mem_upper+1024) >> (PAGE_SHIFT - 10); init_frametable(max_page); printk("Initialised all memory on a %luMB machine\n", max_page >> (20-PAGE_SHIFT)); diff --git a/xen/common/lib.c b/xen/common/lib.c index 3d7cc8c00e..6c8f57875d 100644 --- a/xen/common/lib.c +++ b/xen/common/lib.c @@ -2,125 +2,6 @@ #include #include -#if 0 // jws - now in string.c, string.h, asm/string.h -int memcmp(const void * cs,const void * ct,size_t count) -{ - const unsigned char *su1, *su2; - signed char res = 0; - - for( su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--) - if ((res = *su1 - *su2) != 0) - break; - return res; -} - -void * memcpy(void * dest,const void *src,size_t count) -{ - char *tmp = (char *) dest, *s = (char *) src; - - while (count--) - *tmp++ = *s++; - - return dest; -} - -int strncmp(const char * cs,const char * ct,size_t count) -{ - register signed char __res = 0; - - while (count) { - if ((__res = *cs - *ct++) != 0 || !*cs++) - break; - count--; - } - - return __res; -} - -int strcmp(const char * cs,const char * ct) -{ - register signed char __res; - - while (1) { - if ((__res = *cs - *ct++) != 0 || !*cs++) - break; - } - - return __res; -} - -char * strcpy(char * dest,const char *src) -{ - char *tmp = dest; - - while ((*dest++ = *src++) != '\0') - /* nothing */; - return tmp; -} - -char * strncpy(char * dest,const char *src,size_t count) -{ - char *tmp = dest; - - while (count-- && (*dest++ = *src++) != '\0') - /* nothing */; - - return tmp; -} - -void * 
memset(void * s,int c,size_t count) -{ - char *xs = (char *) s; - - while (count--) - *xs++ = c; - - return s; -} - -size_t strnlen(const char * s, size_t count) -{ - const char *sc; - - for (sc = s; count-- && *sc != '\0'; ++sc) - /* nothing */; - return sc - s; -} - -size_t strlen(const char * s) -{ - const char *sc; - - for (sc = s; *sc != '\0'; ++sc) - /* nothing */; - return sc - s; -} - -char * strchr(const char * s, int c) -{ - for(; *s != (char) c; ++s) - if (*s == '\0') - return NULL; - return (char *) s; -} - -char * strstr(const char * s1,const char * s2) -{ - int l1, l2; - - l2 = strlen(s2); - if (!l2) - return (char *) s1; - l1 = strlen(s1); - while (l1 >= l2) { - l1--; - if (!memcmp(s1,s2,l2)) - return (char *) s1; - s1++; - } - return NULL; -} -#endif /* for inc/ctype.h */ unsigned char _ctype[] = { @@ -213,6 +94,7 @@ unsigned char *quad_to_str(unsigned long q, unsigned char *s) #include +#if BITS_PER_LONG == 32 /* * Depending on the desired operation, we view a `long long' (aka quad_t) in @@ -526,7 +408,7 @@ __udivdi3(a, b) return (__qdivrem(a, b, (u64 *)0)); } - +#endif /* BITS_PER_LONG == 32 */ /* HASH/RANDOMISATION FUNCTION @@ -535,8 +417,6 @@ __udivdi3(a, b) * See http://burlteburtle.net/bob/hash/evahash.html */ -typedef unsigned long ub4; - #define mix(a,b,c) \ do { \ a -= b; a -= c; a ^= (c>>13); \ @@ -550,9 +430,9 @@ typedef unsigned long ub4; c -= a; c -= b; c = (c ^ (b>>15)) & 0xffffffff; \ } while ( 0 ) -unsigned long hash(unsigned char *k, unsigned long len) +u32 hash(unsigned char *k, unsigned long len) { - unsigned long a, b, c, l; + u32 a, b, c, l; l = len; a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */ @@ -560,9 +440,9 @@ unsigned long hash(unsigned char *k, unsigned long len) while ( l >= 12 ) { - a += (k[0] + ((ub4)k[1]<<8) + ((ub4)k[2]<<16) + ((ub4)k[3]<<24)); - b += (k[4] + ((ub4)k[5]<<8) + ((ub4)k[6]<<16) + ((ub4)k[7]<<24)); - c += (k[8] + ((ub4)k[9]<<8) + ((ub4)k[10]<<16) + ((ub4)k[11]<<24)); + a += (k[0] + 
((u32)k[1]<<8) + ((u32)k[2]<<16) + ((u32)k[3]<<24)); + b += (k[4] + ((u32)k[5]<<8) + ((u32)k[6]<<16) + ((u32)k[7]<<24)); + c += (k[8] + ((u32)k[9]<<8) + ((u32)k[10]<<16) + ((u32)k[11]<<24)); mix(a,b,c); k += 12; l -= 12; } @@ -570,17 +450,17 @@ unsigned long hash(unsigned char *k, unsigned long len) c += len; switch ( l ) { - case 11: c+=((ub4)k[10]<<24); - case 10: c+=((ub4)k[9]<<16); - case 9 : c+=((ub4)k[8]<<8); + case 11: c+=((u32)k[10]<<24); + case 10: c+=((u32)k[9]<<16); + case 9 : c+=((u32)k[8]<<8); /* the first byte of c is reserved for the length */ - case 8 : b+=((ub4)k[7]<<24); - case 7 : b+=((ub4)k[6]<<16); - case 6 : b+=((ub4)k[5]<<8); + case 8 : b+=((u32)k[7]<<24); + case 7 : b+=((u32)k[6]<<16); + case 6 : b+=((u32)k[5]<<8); case 5 : b+=k[4]; - case 4 : a+=((ub4)k[3]<<24); - case 3 : a+=((ub4)k[2]<<16); - case 2 : a+=((ub4)k[1]<<8); + case 4 : a+=((u32)k[3]<<24); + case 3 : a+=((u32)k[2]<<16); + case 2 : a+=((u32)k[1]<<8); case 1 : a+=k[0]; /* case 0: nothing left to add */ } diff --git a/xen/include/asm-i386/config.h b/xen/include/asm-i386/config.h index 3dd2986492..3e70edceca 100644 --- a/xen/include/asm-i386/config.h +++ b/xen/include/asm-i386/config.h @@ -118,6 +118,8 @@ SYMBOL_NAME_LABEL(name) #endif +#define PGT_base_page_table PGT_l2_page_table + #define barrier() __asm__ __volatile__("": : :"memory") #define __HYPERVISOR_CS 0x0808 diff --git a/xen/include/asm-i386/processor.h b/xen/include/asm-i386/processor.h index c7df85aa28..0debc605db 100644 --- a/xen/include/asm-i386/processor.h +++ b/xen/include/asm-i386/processor.h @@ -354,7 +354,7 @@ struct tss_struct { }; struct thread_struct { - unsigned long esp1, ss1; + unsigned long guestos_sp, guestos_ss; /* Hardware debugging registers */ unsigned long debugreg[8]; /* %%db0-7 debug registers */ /* floating point info */ diff --git a/xen/include/asm-i386/types.h b/xen/include/asm-i386/types.h index 2bd0f258b9..0bba049f79 100644 --- a/xen/include/asm-i386/types.h +++ b/xen/include/asm-i386/types.h 
@@ -3,6 +3,8 @@ typedef unsigned short umode_t; +typedef unsigned int size_t; + /* * __xx is ok: it doesn't pollute the POSIX namespace. Use these in the * header files exported to user space diff --git a/xen/include/asm-x86_64/atomic.h b/xen/include/asm-x86_64/atomic.h index f4eb858312..89e564a854 100644 --- a/xen/include/asm-x86_64/atomic.h +++ b/xen/include/asm-x86_64/atomic.h @@ -1,5 +1,5 @@ -#ifndef __ARCH_I386_ATOMIC__ -#define __ARCH_I386_ATOMIC__ +#ifndef __ARCH_X86_64_ATOMIC__ +#define __ARCH_X86_64_ATOMIC__ #include @@ -202,4 +202,4 @@ __asm__ __volatile__(LOCK "orl %0,%1" \ #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() -#endif +#endif /* __ARCH_X86_64_ATOMIC__ */ diff --git a/xen/include/asm-x86_64/config.h b/xen/include/asm-x86_64/config.h index 2fed6b7959..8b7c97612a 100644 --- a/xen/include/asm-x86_64/config.h +++ b/xen/include/asm-x86_64/config.h @@ -2,12 +2,14 @@ * config.h * * A Linux-style configuration list. + * */ #ifndef __XENO_X86_64_CONFIG_H__ #define __XENO_X86_64_CONFIG_H__ -#define CONFIG_X86 1 +#define CONFIG_X86 1 +#define CONFIG_X86_64BITMODE 1 #define CONFIG_SMP 1 #define CONFIG_X86_LOCAL_APIC 1 @@ -53,63 +55,200 @@ #define __cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) #define ____cacheline_aligned __cacheline_aligned +#define PHYSICAL_ADDRESS_BITS 52 +#define MAX_PHYSICAL_ADDRESS (1 << PHYSICAL_ADDRESS_BITS) +#define VIRTUAL_ADDRESS_BITS 48 +#define XEN_PAGE_SIZE 4096 + +#define PTE_SIZE 8 +#define TOTAL_PTES (512ULL * 512 * 512 * 512) + +/* next PML4 from an _END address */ +#define PML4_BITS 39 +#define PML4_SPACE (1ULL << PML4_BITS) + /* - * Virtual addresses beyond this are not modifiable by guest OSes. The - * machine->physical mapping table starts at this address, read-only. 
+ * Memory layout + * + * 0x0000000000000000 - 0x00007fffffffffff Guest & user apps (128TB) + * (Only for 32-bit guests) + * 0x00000000fc000000 - 0x00000000fc3fffff Machine/Physical 32-bit shadow (4MB) + * 0x00000000fc400000 - 0x00000000feffffff IO remap for 32-bit guests (44MB) + * 0x00000000ff000000 - 0x00000000ff3fffff 32-bit PTE shadow (4MB) + * + * 0xffff800000000000 - 0xffff807fffffffff Linear page table (512GB) + * 0xffff808000000000 - 0xffff80ffffffffff Reserved for shadow page table (512GB) + * + * 0xffff810000000000 - 0xffff82ffffffffff Xen PML4 slots + * 0xffff810000000000 - 0xffff81003fffffff Xen hypervisor virtual space (1GB) + * 0xffff810040000000 - 0xffff81007fffffff Per-domain mappings (1GB) + * 0xffff810080000000 - 0xffff81387fffffff R/O physical map (224GB) + * 0xffff813880000000 - 0xffff81707fffffff R/W physical map (224GB) + * 0xffff817080000000 - 0xffff82c07fffffff Frame table (1344GB) + * 0xffff82c080000000 - 0xffff82c0bfffffff I/O remap space (1GB) + * 0xffff82c0c0000000 - 0xffff82ffffffffff (253GB) + * + * 0xffff830000000000 - 0xffff87ffffffffff RESERVED (5TB) + * + * 0xffff880000000000 - ... Physical 1:1 direct mapping (112TB max) + * 0xffff880000000000 - 0xffff880001000000 Low memory DMA region (16M) + * + * 0xfffff80000000000 - 0xffffffffffffffff Reserved for guest (8TB) + * + * The requirement that we have a 1:1 map of physical memory limits + * the maximum memory size we can support. With only 48 virtual address + * bits, and the assumption that guests will run users in positive address + * space, a contiguous 1:1 map can only live in the negative address space. + * Since we don't want to bump guests out of the very top of memory and + * force relocation, we can't use this entire space, and Xen has several + * heavy mappings that require PML4 slices. Just to be safe, we reserve + * 16 PML4s each for Xen and the guest. 224 PML4s give us 112 terabytes + * of addressable memory. 
Any high device physical addresses beyond this + * region can be mapped into the IO remap space or some of the reserved + * 6TB region. + * + * 112 TB is just 16 TB shy of the maximum physical memory supported + * on Linux 2.6.0, and should be enough for anybody. + * + * There are some additional constraints in the memory layout that require + * several changes from the i386 architecture. + * + * ACPI data and ACPI non-volatile storage must be placed in some region + * of memory below the 4GB mark. Depending on the BIOS and system, we + * may have this located as low as 1GB. This means allocating large + * chunks of physically contiguous memory from the direct mapping may not + * be possible. + * + * The full frame table for 112TB of physical memory currently occupies + * 1344GB space. This clearly can not be allocated in physically contiguous + * space, so it must be moved to a virtual address. + * + * Both copies of the machine->physical table must also be relocated. + * (112 TB / 4k) * 8 bytes means that each copy of the physical map requires + * 224GB of space, thus it also must move to VM space. + * + * The physical pages used to allocate the page tables for the direct 1:1 + * map may occupy (112TB / 2M) * 8 bytes = 448MB. This is almost guaranteed + * to fit in contiguous physical memory, but these pages used to be allocated + * in the Xen monitor address space. This means the Xen address space must + * accommodate up to ~500 MB, which means it also must move out of the + * direct mapped region. + * + * Since both copies of the MPT, the frame table, and Xen now exist in + * purely virtual space, we have the added advantage of being able to + * map them to local pages on NUMA machines, or use NUMA aware memory + * allocation within Xen itself. + * + * Additionally, the 1:1 page table now exists contiguously in virtual + * space, but may be mapped to physically separated pages, allowing + * each node to contain the page tables for its own local memory. 
Setting + * up this mapping presents a bit of a chicken-egg problem, but is possible + * as a future enhancement. + * + * Zachary Amsden (zamsden@cisco.com) + * */ -#define HYPERVISOR_VIRT_START (0xFFFF800000000000ULL) - + +/* Guest and user space */ +#define NSPACE_VIRT_START 0 +#define NSPACE_VIRT_END (1ULL << (VIRTUAL_ADDRESS_BITS - 1)) + +/* Privileged space */ +#define ESPACE_VIRT_END 0 +#define ESPACE_VIRT_START (ESPACE_VIRT_END-(1ULL << (VIRTUAL_ADDRESS_BITS-1))) + +/* reservations in e-space */ +#define GUEST_RESERVED_PML4S 16 +#define XEN_RESERVED_PML4S 16 + +#define MAX_MEMORY_SIZE ((1ULL << (VIRTUAL_ADDRESS_BITS-1)) \ + -((GUEST_RESERVED_PML4S + XEN_RESERVED_PML4S) * PML4_SPACE)) +#define MAX_MEMORY_FRAMES (MAX_MEMORY_SIZE / XEN_PAGE_SIZE) + /* - * Xen exists in the highest 2GB of address space for RIP-relative - * addressing + * Virtual addresses beyond this are not modifiable by guest OSes. */ -#define XEN_VIRT_START (0xFFFFFFFF80000000ULL) - +#define HYPERVISOR_VIRT_START ESPACE_VIRT_START +#define HYPERVISOR_VIRT_END (ESPACE_VIRT_END-(GUEST_RESERVED_PML4S * PML4_SPACE)) + +/* First 512GB of virtual address space is used as a linear p.t. mapping. */ +#define LINEAR_PT_VIRT_START (HYPERVISOR_VIRT_START) +#define LINEAR_PT_VIRT_END (LINEAR_PT_VIRT_START + (PTE_SIZE * TOTAL_PTES)) + +/* Reserve some space for a shadow PT mapping */ +#define SHADOW_PT_VIRT_START (LINEAR_PT_VIRT_END) +#define SHADOW_PT_VIRT_END (SHADOW_PT_VIRT_START + (PTE_SIZE * TOTAL_PTES)) + +/* Xen exists in the first 1GB of the next PML4 space */ +#define MAX_MONITOR_ADDRESS (1 * 1024 * 1024 * 1024) +#define MONITOR_VIRT_START (SHADOW_PT_VIRT_END) +#define MONITOR_VIRT_END (MONITOR_VIRT_START + MAX_MONITOR_ADDRESS) + +/* Next 1GB of virtual address space used for per-domain mappings (eg. GDT). 
*/ +#define PERDOMAIN_VIRT_START (MONITOR_VIRT_END) +#define PERDOMAIN_VIRT_END (PERDOMAIN_VIRT_START + (512 * 512 * 4096)) +#define GDT_VIRT_START (PERDOMAIN_VIRT_START) +#define GDT_VIRT_END (GDT_VIRT_START + (128*1024)) +#define LDT_VIRT_START (GDT_VIRT_END) +#define LDT_VIRT_END (LDT_VIRT_START + (128*1024)) + /* - * First 4MB are mapped read-only for all. It's for the machine->physical + * First set of MPTs are mapped read-only for all. It's for the machine->physical * mapping table (MPT table). The following are virtual addresses. */ -#define READONLY_MPT_VIRT_START (HYPERVISOR_VIRT_START) -#define READONLY_MPT_VIRT_END (READONLY_MPT_VIRT_START + (4*1024*1024)) +#define READONLY_MPT_VIRT_START (PERDOMAIN_VIRT_END) +#define READONLY_MPT_VIRT_END (READONLY_MPT_VIRT_START + (PTE_SIZE * MAX_MEMORY_FRAMES)) + +/* R/W machine->physical table */ +#define RDWR_MPT_VIRT_START (READONLY_MPT_VIRT_END) +#define RDWR_MPT_VIRT_END (RDWR_MPT_VIRT_START + (PTE_SIZE * MAX_MEMORY_FRAMES)) + +/* Frame table */ +#define FRAMETABLE_ENTRY_SIZE (48) +#define FRAMETABLE_VIRT_START (RDWR_MPT_VIRT_END) +#define FRAMETABLE_VIRT_END (FRAMETABLE_VIRT_START + (FRAMETABLE_ENTRY_SIZE * MAX_MEMORY_FRAMES)) + +/* Next 1GB of virtual address space used for ioremap(). */ +#define IOREMAP_VIRT_START (FRAMETABLE_VIRT_END) +#define IOREMAP_VIRT_END (IOREMAP_VIRT_START + (512 * 512 * 4096)) + +/* And the virtual addresses for the direct-map region... */ +#define DIRECTMAP_VIRT_START (ESPACE_VIRT_START + (XEN_RESERVED_PML4S * PML4_SPACE)) +#define DIRECTMAP_VIRT_END (DIRECTMAP_VIRT_START + MAX_DIRECTMAP_ADDRESS) + /* - * Next 16MB is fixed monitor space, which is part of a 44MB direct-mapped - * memory region. The following are machine addresses. + * Next is the direct-mapped memory region. The following are machine addresses. 
*/ -#define MAX_MONITOR_ADDRESS (16*1024*1024) #define MAX_DMA_ADDRESS (16*1024*1024) -#define MAX_DIRECTMAP_ADDRESS (44*1024*1024) -/* And the virtual addresses for the direct-map region... */ -#define DIRECTMAP_VIRT_START (READONLY_MPT_VIRT_END) -#define DIRECTMAP_VIRT_END (DIRECTMAP_VIRT_START + MAX_DIRECTMAP_ADDRESS) -#define MONITOR_VIRT_START (DIRECTMAP_VIRT_START) -#define MONITOR_VIRT_END (MONITOR_VIRT_START + MAX_MONITOR_ADDRESS) -#define RDWR_MPT_VIRT_START (MONITOR_VIRT_END) -#define RDWR_MPT_VIRT_END (RDWR_MPT_VIRT_START + (4*1024*1024)) -#define FRAMETABLE_VIRT_START (RDWR_MPT_VIRT_END) -#define FRAMETABLE_VIRT_END (DIRECTMAP_VIRT_END) -/* Next 4MB of virtual address space is used as a linear p.t. mapping. */ -#define LINEAR_PT_VIRT_START (DIRECTMAP_VIRT_END) -#define LINEAR_PT_VIRT_END (LINEAR_PT_VIRT_START + (4*1024*1024)) -/* Next 4MB of virtual address space used for per-domain mappings (eg. GDT). */ -#define PERDOMAIN_VIRT_START (LINEAR_PT_VIRT_END) -#define PERDOMAIN_VIRT_END (PERDOMAIN_VIRT_START + (4*1024*1024)) -#define GDT_VIRT_START (PERDOMAIN_VIRT_START) -#define GDT_VIRT_END (GDT_VIRT_START + (64*1024)) -#define LDT_VIRT_START (GDT_VIRT_END) -#define LDT_VIRT_END (LDT_VIRT_START + (64*1024)) -/* Penultimate 4MB of virtual address space used for domain page mappings. */ -#define MAPCACHE_VIRT_START (PERDOMAIN_VIRT_END) -#define MAPCACHE_VIRT_END (MAPCACHE_VIRT_START + (4*1024*1024)) -/* Final 4MB of virtual address space used for ioremap(). */ -#define IOREMAP_VIRT_START (MAPCACHE_VIRT_END) -#define IOREMAP_VIRT_END (IOREMAP_VIRT_START + (4*1024*1024)) +#define MAX_DIRECTMAP_ADDRESS MAX_MEMORY_SIZE + + /* - * Amount of slack domain memory to leave in system, in megabytes. + * Amount of slack domain memory to leave in system, in kilobytes. * Prevents a hard out-of-memory crunch for thinsg like network receive. */ #define SLACK_DOMAIN_MEM_KILOBYTES 2048 + +/* + * These will probably change in the future.. 
+ * locations for 32-bit guest compatibility mappings + */ + +/* 4M of 32-bit machine-physical shadow in low 4G of VM space */ +#define SHADOW_MPT32_VIRT_START (0xfc000000) +#define SHADOW_MPT32_VIRT_END (SHADOW_MPT32_VIRT_START + (4 * 1024 * 1024)) + +/* 44M of I/O remap for 32-bit drivers */ +#define IOREMAP_LOW_VIRT_START (SHADOW_MPT32_VIRT_END) +#define IOREMAP_LOW_VIRT_END (IOREMAP_LOW_VIRT_START + (44 * 1024 * 1024)) + +/* 4M of 32-bit page table */ +#define SHADOW_PT32_VIRT_START (IOREMAP_LOW_VIRT_END) +#define SHADOW_PT32_VIRT_END (SHADOW_PT32_VIRT_START + (4 * 1024 * 1024)) + + /* Linkage for x86 */ #define FASTCALL(x) x __attribute__((regparm(3))) #define asmlinkage __attribute__((regparm(0))) @@ -127,6 +266,8 @@ SYMBOL_NAME_LABEL(name) #endif +#define PGT_base_page_table PGT_l4_page_table + #define barrier() __asm__ __volatile__("": : :"memory") /* diff --git a/xen/include/asm-x86_64/current.h b/xen/include/asm-x86_64/current.h index dd288ca8b3..d5ffb0720a 100644 --- a/xen/include/asm-x86_64/current.h +++ b/xen/include/asm-x86_64/current.h @@ -26,7 +26,7 @@ static inline void set_current(struct task_struct *p) static inline execution_context_t *get_execution_context(void) { execution_context_t *execution_context; - __asm__( "andq %%rsp,%0; addl %2,%0" + __asm__( "andq %%rsp,%0; addq %2,%0" : "=r" (execution_context) : "0" (~(STACK_SIZE-1)), "i" (STACK_SIZE-STACK_RESERVED) ); return execution_context; @@ -42,7 +42,7 @@ static inline unsigned long get_stack_top(void) #define schedule_tail(_p) \ __asm__ __volatile__ ( \ - "andq %%rsp,%0; addq %2,%0; movl %0,%%rsp; jmp *%1" \ + "andq %%rsp,%0; addq %2,%0; movq %0,%%rsp; jmp *%1" \ : : "r" (~(STACK_SIZE-1)), \ "r" (unlikely(is_idle_task((_p))) ? 
\ continue_cpu_idle_loop : \ diff --git a/xen/include/asm-x86_64/desc.h b/xen/include/asm-x86_64/desc.h index ef864de036..e8556e976e 100644 --- a/xen/include/asm-x86_64/desc.h +++ b/xen/include/asm-x86_64/desc.h @@ -8,7 +8,7 @@ #define __FIRST_PER_CPU_ENTRY (FIRST_RESERVED_GDT_ENTRY + 8) #define __CPU_DESC_INDEX(x,field) \ - ((x) * sizeof(struct per_cpu_gdt) + offsetof(struct per_cpu_gdt, field) + (FIRST_PER_CPU_ENTRY*8)) + ((x) * sizeof(struct per_cpu_gdt) + offsetof(struct per_cpu_gdt, field) + (__FIRST_PER_CPU_ENTRY*8)) #define __LDT(n) (((n)<<1) + __FIRST_LDT_ENTRY) #define load_TR(cpu) asm volatile("ltr %w0"::"r" (__CPU_DESC_INDEX(cpu, tss))); diff --git a/xen/include/asm-x86_64/io.h b/xen/include/asm-x86_64/io.h index 21c22876a2..aa580dd71a 100644 --- a/xen/include/asm-x86_64/io.h +++ b/xen/include/asm-x86_64/io.h @@ -2,6 +2,7 @@ #define _ASM_IO_H #include +#include /* * This file contains the definitions for the x86 IO instructions @@ -139,9 +140,12 @@ extern inline void * phys_to_virt(unsigned long address) #ifdef CONFIG_DISCONTIGMEM #include #else -#define page_to_phys(page) (((page) - frame_table) << PAGE_SHIFT) +#define page_to_phys(page) (((page) - frame_table) << PAGE_SHIFT) #endif +#define page_to_pfn(page) ((unsigned long)((_page) - frame_table)) +#define page_to_virt(page) (phys_to_virt(page_to_phys(_page))) + extern void * __ioremap(unsigned long offset, unsigned long size, unsigned long flags); extern inline void * ioremap (unsigned long offset, unsigned long size) diff --git a/xen/include/asm-x86_64/ldt.h b/xen/include/asm-x86_64/ldt.h index 7a345ceb40..e0f139829e 100644 --- a/xen/include/asm-x86_64/ldt.h +++ b/xen/include/asm-x86_64/ldt.h @@ -9,7 +9,7 @@ static inline void load_LDT(struct task_struct *p) if ( (ents = p->mm.ldt_ents) == 0 ) { - __asm__ __volatile__ ( "lldt %%rax" : : "a" (0) ); + __asm__ __volatile__ ( "lldt %w0" : : "r" (0) ); } else { @@ -17,17 +17,17 @@ static inline void load_LDT(struct task_struct *p) struct ldttss_desc 
*desc; cpu = smp_processor_id(); - desc = (struct desc_struct *)((char *)GET_GDT_ADDRESS(p) + __CPU_DESC_INDEX(cpu, ldt)); + desc = (struct ldttss_desc *)((char *)GET_GDT_ADDRESS(p) + __CPU_DESC_INDEX(cpu, ldt)); desc->limit0 = ents*8-1; desc->base0 = LDT_VIRT_START&0xffff; desc->base1 = (LDT_VIRT_START&0xff0000)>>16; desc->type = DESC_LDT; desc->dpl = 0; desc->p = 1; - desc->limit = 0; + desc->limit1 = 0; desc->zero0 = 0; desc->g = 0; - desc->base2 = (LDT_VIRST_START&0xff000000)>>24; + desc->base2 = (LDT_VIRT_START&0xff000000)>>24; desc->base3 = LDT_VIRT_START>>32; desc->zero1 = 0; __load_LDT(cpu); diff --git a/xen/include/asm-x86_64/page.h b/xen/include/asm-x86_64/page.h index b016b635b2..d7cb5c6b3f 100644 --- a/xen/include/asm-x86_64/page.h +++ b/xen/include/asm-x86_64/page.h @@ -265,7 +265,6 @@ struct bug_frame { #ifndef CONFIG_DISCONTIGMEM #define virt_to_page(kaddr) (frame_table + (__pa(kaddr) >> PAGE_SHIFT)) #define pfn_to_page(pfn) (frame_table + (pfn)) -#define page_to_pfn(page) ((page) - frame_table) #define page_address(_p) (__va(((_p) - frame_table) << PAGE_SHIFT)) #define VALID_PAGE(page) (((page) - frame_table) < max_mapnr) #endif diff --git a/xen/include/asm-x86_64/pci.h b/xen/include/asm-x86_64/pci.h index 5c931adb36..ffb30f50ba 100644 --- a/xen/include/asm-x86_64/pci.h +++ b/xen/include/asm-x86_64/pci.h @@ -27,10 +27,8 @@ int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq); #include #include #include -/*#include */ #include #include -#include struct pci_dev; extern int force_mmu; @@ -96,14 +94,16 @@ static inline void pci_dma_sync_single(struct pci_dev *hwdev, dma_addr_t dma_handle, size_t size, int direction) { - BUG_ON(direction == PCI_DMA_NONE); + if (direction == PCI_DMA_NONE) + out_of_line_bug(); } static inline void pci_dma_sync_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nelems, int direction) { - BUG_ON(direction == PCI_DMA_NONE); + if (direction == PCI_DMA_NONE) + out_of_line_bug(); } /* The PCI address space does 
equal the physical memory @@ -162,6 +162,19 @@ static inline dma_addr_t pci_map_page(struct pci_dev *hwdev, struct pfn_info *pa #define BAD_DMA_ADDRESS (-1UL) + +/* Unmap a set of streaming mode DMA translations. + * Again, cpu read rules concerning calls here are the same as for + * pci_unmap_single() above. + */ +static inline void pci_unmap_sg(struct pci_dev *dev, struct scatterlist *sg, + int nents, int dir) +{ + if (dir == PCI_DMA_NONE) + out_of_line_bug(); +} + + /* Map a set of buffers described by scatterlist in streaming * mode for DMA. This is the scather-gather version of the * above pci_map_single interface. Here the scatter gather list @@ -181,18 +194,20 @@ static inline int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction) { int i; - - BUG_ON(direction == PCI_DMA_NONE); - + + if (direction == PCI_DMA_NONE) + out_of_line_bug(); + /* * temporary 2.4 hack */ for (i = 0; i < nents; i++ ) { struct scatterlist *s = &sg[i]; void *addr = s->address; - if (addr) - BUG_ON(s->page || s->offset); - else if (s->page) + if (addr) { + if (s->page || s->offset) + out_of_line_bug(); + } else if (s->page) addr = page_address(s->page) + s->offset; #if 0 /* Invalid check, since address==0 is valid. */ @@ -209,17 +224,6 @@ static inline int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, pci_unmap_sg(hwdev, sg, i, direction); return 0; } - -/* Unmap a set of streaming mode DMA translations. - * Again, cpu read rules concerning calls here are the same as for - * pci_unmap_single() above. 
- */ -static inline void pci_unmap_sg(struct pci_dev *dev, struct scatterlist *sg, - int nents, int dir) -{ - if (direction == PCI_DMA_NONE) - out_of_line_bug(); -} /* Make physical memory consistent for a single @@ -259,11 +263,6 @@ static inline void pci_dma_sync_sg(struct pci_dev *hwdev, #endif -extern int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, - int nents, int direction); -extern void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, - int nents, int direction); - #define pci_unmap_page pci_unmap_single /* Return whether the given PCI device DMA address mask can @@ -297,6 +296,7 @@ pci_dac_page_to_dma(struct pci_dev *pdev, struct pfn_info *page, unsigned long o static __inline__ struct pfn_info * pci_dac_dma_to_page(struct pci_dev *pdev, dma64_addr_t dma_addr) { + unsigned long poff = (dma_addr >> PAGE_SHIFT); return frame_table + poff; } diff --git a/xen/include/asm-x86_64/processor.h b/xen/include/asm-x86_64/processor.h index 47d0f751e9..16d095f6c8 100644 --- a/xen/include/asm-x86_64/processor.h +++ b/xen/include/asm-x86_64/processor.h @@ -340,7 +340,8 @@ struct tss_struct { } __attribute__((packed)) ____cacheline_aligned; struct thread_struct { - unsigned long rsp0; + unsigned long guestos_sp; + unsigned long guestos_ss; unsigned long rip; unsigned long rsp; unsigned long userrsp; /* Copy from PDA */ diff --git a/xen/include/asm-x86_64/types.h b/xen/include/asm-x86_64/types.h index 1b865c48fc..ea7aac72f7 100644 --- a/xen/include/asm-x86_64/types.h +++ b/xen/include/asm-x86_64/types.h @@ -3,6 +3,8 @@ typedef unsigned short umode_t; +typedef unsigned long size_t; + /* * __xx is ok: it doesn't pollute the POSIX namespace. 
Use these in the * header files exported to user space diff --git a/xen/include/asm-x86_64/uaccess.h b/xen/include/asm-x86_64/uaccess.h index 56a7a42aeb..49a5db93c2 100644 --- a/xen/include/asm-x86_64/uaccess.h +++ b/xen/include/asm-x86_64/uaccess.h @@ -103,11 +103,11 @@ extern void __get_user_8(void); ({ long __val_gu; \ int __ret_gu=1; \ switch(sizeof (*(ptr))) { \ - case 1: _ret_gu=copy_from_user(&__val_gu,ptr,1);break; \ - case 2: _ret_gu=copy_from_user(&__val_gu,ptr,2);break; \ - case 4: _ret_gu=copy_from_user(&__val_gu,ptr,4);break; \ - case 8: _ret_gu=copy_from_user(&__val_gu,ptr,8);break; \ - default: _ret_gu=copy_from_user(&__val_gu,ptr,sizeof(*(ptr)));break;\ ++ case 1: __ret_gu=copy_from_user(&__val_gu,ptr,1);break; \ ++ case 2: __ret_gu=copy_from_user(&__val_gu,ptr,2);break; \ ++ case 4: __ret_gu=copy_from_user(&__val_gu,ptr,4);break; \ ++ case 8: __ret_gu=copy_from_user(&__val_gu,ptr,8);break; \ ++ default: __ret_gu=copy_from_user(&__val_gu,ptr,sizeof(*(ptr)));break;\ /*case 1: __get_user_x(1,__ret_gu,__val_gu,ptr); break;*/ \ /*case 2: __get_user_x(2,__ret_gu,__val_gu,ptr); break;*/ \ /*case 4: __get_user_x(4,__ret_gu,__val_gu,ptr); break;*/ \ diff --git a/xen/include/hypervisor-ifs/arch-i386/hypervisor-if.h b/xen/include/hypervisor-ifs/arch-i386/hypervisor-if.h index e3b32cb4b5..a06020cd96 100644 --- a/xen/include/hypervisor-ifs/arch-i386/hypervisor-if.h +++ b/xen/include/hypervisor-ifs/arch-i386/hypervisor-if.h @@ -35,6 +35,11 @@ #define FLAT_RING3_CS 0x082b /* GDT index 261 */ #define FLAT_RING3_DS 0x0833 /* GDT index 262 */ +#define FLAT_GUESTOS_CS FLAT_RING1_CS +#define FLAT_GUESTOS_DS FLAT_RING1_DS +#define FLAT_USER_CS FLAT_RING3_CS +#define FLAT_USER_DS FLAT_RING3_DS + /* And the trap vector is... 
*/ #define TRAP_INSTR "int $0x82" @@ -99,13 +104,13 @@ typedef struct full_execution_context_st { #define ECF_I387_VALID (1<<0) unsigned long flags; - execution_context_t i386_ctxt; /* User-level CPU registers */ - char i387_ctxt[256]; /* User-level FPU registers */ + execution_context_t cpu_ctxt; /* User-level CPU registers */ + char fpu_ctxt[256]; /* User-level FPU registers */ trap_info_t trap_ctxt[256]; /* Virtual IDT */ unsigned int fast_trap_idx; /* "Fast trap" vector offset */ unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */ unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */ - unsigned long ring1_ss, ring1_esp; /* Virtual TSS (only SS1/ESP1) */ + unsigned long guestos_ss, guestos_esp; /* Virtual TSS (only SS1/ESP1) */ unsigned long pt_base; /* CR3 (pagetable base) */ unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */ unsigned long event_callback_cs; /* CS:EIP of event callback */ @@ -114,6 +119,8 @@ typedef struct full_execution_context_st unsigned long failsafe_callback_eip; } full_execution_context_t; +#define ARCH_HAS_FAST_TRAP + #endif #endif diff --git a/xen/include/hypervisor-ifs/arch-x86_64/hypervisor-if.h b/xen/include/hypervisor-ifs/arch-x86_64/hypervisor-if.h index 0a1101b314..94c031120a 100644 --- a/xen/include/hypervisor-ifs/arch-x86_64/hypervisor-if.h +++ b/xen/include/hypervisor-ifs/arch-x86_64/hypervisor-if.h @@ -35,13 +35,20 @@ #define FLAT_RING3_CS64 0x082b /* GDT index 261 */ #define FLAT_RING3_DS 0x0833 /* GDT index 262 */ +#define FLAT_GUESTOS_DS FLAT_RING3_DS +#define FLAT_GUESTOS_CS FLAT_RING3_CS64 +#define FLAT_GUESTOS_CS32 FLAT_RING3_CS32 + +#define FLAT_USER_DS FLAT_RING3_DS +#define FLAT_USER_CS FLAT_RING3_CS64 +#define FLAT_USER_CS32 FLAT_RING3_CS32 /* And the trap vector is... 
*/ #define TRAP_INSTR "syscall" #ifndef machine_to_phys_mapping -#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) +#define machine_to_phys_mapping ((unsigned long *)0xffff810000000000ULL) #endif #ifndef __ASSEMBLY__ @@ -98,13 +105,12 @@ typedef struct full_execution_context_st { #define ECF_I387_VALID (1<<0) unsigned long flags; - execution_context_t x86_64_ctxt; /* User-level CPU registers */ - char i387_ctxt[512]; /* User-level FPU registers */ + execution_context_t cpu_ctxt; /* User-level CPU registers */ + char fpu_ctxt[512]; /* User-level FPU registers */ trap_info_t trap_ctxt[256]; /* Virtual IDT */ - unsigned int fast_trap_idx; /* "Fast trap" vector offset */ unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */ unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */ - unsigned long ring1_ss, ring1_esp; /* Virtual TSS (only SS1/ESP1) */ + unsigned long guestos_ss, guestos_esp; /* Virtual TSS (only SS1/ESP1) */ unsigned long pt_base; /* CR3 (pagetable base) */ unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */ unsigned long event_callback_cs; /* CS:EIP of event callback */ diff --git a/xen/include/hypervisor-ifs/dom0_ops.h b/xen/include/hypervisor-ifs/dom0_ops.h index 1345293284..8e84f38336 100644 --- a/xen/include/hypervisor-ifs/dom0_ops.h +++ b/xen/include/hypervisor-ifs/dom0_ops.h @@ -116,10 +116,11 @@ typedef struct dom0_getdomaininfo_st typedef struct dom0_getpageframeinfo_st { /* IN variables. */ - unsigned long pfn; /* Machine page frame number to query. */ + unsigned long pfn; /* Machine page frame number to query. */ domid_t domain; /* To which domain does the frame belong? */ /* OUT variables. */ - enum { NONE, L1TAB, L2TAB } type; /* Is the page PINNED to a type? */ + /* Is the page PINNED to a type? 
*/ + enum { NONE, L1TAB, L2TAB, L3TAB, L4TAB } type; } dom0_getpageframeinfo_t; #define DOM0_IOPL 14 diff --git a/xen/include/xeno/lib.h b/xen/include/xeno/lib.h index c3fe557974..f29964f431 100644 --- a/xen/include/xeno/lib.h +++ b/xen/include/xeno/lib.h @@ -51,6 +51,6 @@ unsigned long simple_strtoul(const char *cp,char **endp,unsigned int base); long long simple_strtoll(const char *cp,char **endp,unsigned int base); /* Produce a 32-bit hash from a key string 'k' of length 'len' bytes. */ -unsigned long hash(unsigned char *k, unsigned long len); +u32 hash(unsigned char *k, unsigned long len); #endif /* __LIB_H__ */ diff --git a/xen/include/xeno/types.h b/xen/include/xeno/types.h index c5f8d5586d..0299f74136 100644 --- a/xen/include/xeno/types.h +++ b/xen/include/xeno/types.h @@ -14,8 +14,6 @@ #define LONG_MIN (-LONG_MAX - 1) #define ULONG_MAX (~0UL) -typedef unsigned int size_t; - /* bsd */ typedef unsigned char u_char; typedef unsigned short u_short; diff --git a/xen/net/dev.c b/xen/net/dev.c index fbd9be63c9..1bd9120672 100644 --- a/xen/net/dev.c +++ b/xen/net/dev.c @@ -627,7 +627,7 @@ static void net_rx_action(struct softirq_action *h) * for ethernet header, plus any other alignment padding added by the * driver. 
*/ - offset = (int)skb->data & ~PAGE_MASK; + offset = (int)(long)skb->data & ~PAGE_MASK; skb->head = (u8 *)map_domain_mem(((skb->pf - frame_table) << PAGE_SHIFT)); skb->data = skb->nh.raw = skb->head + offset; diff --git a/xenolinux-2.4.25-sparse/arch/xeno/Makefile b/xenolinux-2.4.25-sparse/arch/xeno/Makefile index 6a1e501b59..a2e6a0e0cd 100644 --- a/xenolinux-2.4.25-sparse/arch/xeno/Makefile +++ b/xenolinux-2.4.25-sparse/arch/xeno/Makefile @@ -51,20 +51,21 @@ HEAD := arch/xeno/kernel/head.o arch/xeno/kernel/init_task.o SUBDIRS += arch/xeno/kernel arch/xeno/mm arch/xeno/lib SUBDIRS += arch/xeno/drivers/console arch/xeno/drivers/network SUBDIRS += arch/xeno/drivers/evtchn arch/xeno/drivers/block -SUBDIRS += arch/xeno/drivers/balloon +SUBDIRS += arch/xeno/drivers/balloon arch/xeno/drivers/vnetif ifdef CONFIG_XENO_PRIV SUBDIRS += arch/xeno/drivers/dom0 endif CORE_FILES += arch/xeno/kernel/kernel.o arch/xeno/mm/mm.o -CORE_FILES += arch/xeno/drivers/evtchn/evtchn.o -CORE_FILES += arch/xeno/drivers/console/con.o -CORE_FILES += arch/xeno/drivers/block/blk.o -CORE_FILES += arch/xeno/drivers/network/net.o +CORE_FILES += arch/xeno/drivers/evtchn/drv.o +CORE_FILES += arch/xeno/drivers/console/drv.o +CORE_FILES += arch/xeno/drivers/block/drv.o +CORE_FILES += arch/xeno/drivers/network/drv.o +CORE_FILES += arch/xeno/drivers/vnetif/drv.o ifdef CONFIG_XENO_PRIV -CORE_FILES += arch/xeno/drivers/dom0/dom0.o +CORE_FILES += arch/xeno/drivers/dom0/drv.o endif -CORE_FILES += arch/xeno/drivers/balloon/balloon_driver.o +CORE_FILES += arch/xeno/drivers/balloon/drv.o LIBS := $(TOPDIR)/arch/xeno/lib/lib.a $(LIBS) $(TOPDIR)/arch/xeno/lib/lib.a arch/xeno/kernel: dummy diff --git a/xenolinux-2.4.25-sparse/arch/xeno/drivers/balloon/Makefile b/xenolinux-2.4.25-sparse/arch/xeno/drivers/balloon/Makefile index f780a515e0..9fb2227978 100644 --- a/xenolinux-2.4.25-sparse/arch/xeno/drivers/balloon/Makefile +++ b/xenolinux-2.4.25-sparse/arch/xeno/drivers/balloon/Makefile @@ -1,3 +1,3 @@ -O_TARGET := 
balloon_driver.o +O_TARGET := drv.o obj-y := balloon.o include $(TOPDIR)/Rules.make diff --git a/xenolinux-2.4.25-sparse/arch/xeno/drivers/block/Makefile b/xenolinux-2.4.25-sparse/arch/xeno/drivers/block/Makefile index 7c87e099c7..35986ca54a 100644 --- a/xenolinux-2.4.25-sparse/arch/xeno/drivers/block/Makefile +++ b/xenolinux-2.4.25-sparse/arch/xeno/drivers/block/Makefile @@ -1,3 +1,3 @@ -O_TARGET := blk.o -obj-y := xl_block.o xl_vbd.o +O_TARGET := drv.o +obj-y := block.o vbd.o include $(TOPDIR)/Rules.make diff --git a/xenolinux-2.4.25-sparse/arch/xeno/drivers/block/xl_block.c b/xenolinux-2.4.25-sparse/arch/xeno/drivers/block/block.c similarity index 98% rename from xenolinux-2.4.25-sparse/arch/xeno/drivers/block/xl_block.c rename to xenolinux-2.4.25-sparse/arch/xeno/drivers/block/block.c index dac8c2665f..c01a44d8f1 100644 --- a/xenolinux-2.4.25-sparse/arch/xeno/drivers/block/xl_block.c +++ b/xenolinux-2.4.25-sparse/arch/xeno/drivers/block/block.c @@ -1,5 +1,5 @@ /****************************************************************************** - * xl_block.c + * block.c * * Xenolinux virtual block-device driver. * @@ -7,7 +7,7 @@ * Modifications by Mark A. 
Williamson are (c) Intel Research Cambridge */ -#include "xl_block.h" +#include "block.h" #include #include #include @@ -219,11 +219,11 @@ int xenolinux_block_ioctl(struct inode *inode, struct file *filep, return 0; case SCSI_IOCTL_GET_BUS_NUMBER: - DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in xl_block"); + DPRINTK("FIXME: SCSI_IOCTL_GET_BUS_NUMBER ioctl in Xen blkdev"); return -ENOSYS; default: - printk(KERN_ALERT "ioctl %08x not supported by xl_block\n", command); + printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", command); return -ENOSYS; } diff --git a/xenolinux-2.4.25-sparse/arch/xeno/drivers/block/xl_block.h b/xenolinux-2.4.25-sparse/arch/xeno/drivers/block/block.h similarity index 94% rename from xenolinux-2.4.25-sparse/arch/xeno/drivers/block/xl_block.h rename to xenolinux-2.4.25-sparse/arch/xeno/drivers/block/block.h index c735a6ec44..ef8c241387 100644 --- a/xenolinux-2.4.25-sparse/arch/xeno/drivers/block/xl_block.h +++ b/xenolinux-2.4.25-sparse/arch/xeno/drivers/block/block.h @@ -1,11 +1,11 @@ /****************************************************************************** - * xl_block.h + * block.h * * Shared definitions between all levels of XenoLinux Virtual block devices. 
*/ -#ifndef __XL_BLOCK_H__ -#define __XL_BLOCK_H__ +#ifndef __XENO_DRIVERS_BLOCK_H__ +#define __XENO_DRIVERS_BLOCK_H__ #include #include @@ -79,4 +79,4 @@ static inline xl_disk_t *xldev_to_xldisk(kdev_t xldev) extern int xlvbd_init(void); extern void xlvbd_cleanup(void); -#endif /* __XL_BLOCK_H__ */ +#endif /* __XENO_DRIVERS_BLOCK_H__ */ diff --git a/xenolinux-2.4.25-sparse/arch/xeno/drivers/block/xl_vbd.c b/xenolinux-2.4.25-sparse/arch/xeno/drivers/block/vbd.c similarity index 99% rename from xenolinux-2.4.25-sparse/arch/xeno/drivers/block/xl_vbd.c rename to xenolinux-2.4.25-sparse/arch/xeno/drivers/block/vbd.c index cf39c5d71a..e3473dab28 100644 --- a/xenolinux-2.4.25-sparse/arch/xeno/drivers/block/xl_vbd.c +++ b/xenolinux-2.4.25-sparse/arch/xeno/drivers/block/vbd.c @@ -1,5 +1,5 @@ /****************************************************************************** - * xl_vbd.c + * vbd.c * * Xenolinux virtual block-device driver (xvd). * @@ -7,7 +7,7 @@ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge */ -#include "xl_block.h" +#include "block.h" #include /* diff --git a/xenolinux-2.4.25-sparse/arch/xeno/drivers/console/Makefile b/xenolinux-2.4.25-sparse/arch/xeno/drivers/console/Makefile index 546180a3c2..aaa546a8f3 100644 --- a/xenolinux-2.4.25-sparse/arch/xeno/drivers/console/Makefile +++ b/xenolinux-2.4.25-sparse/arch/xeno/drivers/console/Makefile @@ -1,3 +1,3 @@ -O_TARGET := con.o +O_TARGET := drv.o obj-$(CONFIG_XEN_CONSOLE) := console.o include $(TOPDIR)/Rules.make diff --git a/xenolinux-2.4.25-sparse/arch/xeno/drivers/dom0/Makefile b/xenolinux-2.4.25-sparse/arch/xeno/drivers/dom0/Makefile index 9030801f14..3e2e17bd23 100644 --- a/xenolinux-2.4.25-sparse/arch/xeno/drivers/dom0/Makefile +++ b/xenolinux-2.4.25-sparse/arch/xeno/drivers/dom0/Makefile @@ -1,3 +1,3 @@ -O_TARGET := dom0.o -obj-y := dom0_core.o vfr.o +O_TARGET := drv.o +obj-y := core.o vfr.o include $(TOPDIR)/Rules.make diff --git 
a/xenolinux-2.4.25-sparse/arch/xeno/drivers/dom0/dom0_core.c b/xenolinux-2.4.25-sparse/arch/xeno/drivers/dom0/core.c similarity index 94% rename from xenolinux-2.4.25-sparse/arch/xeno/drivers/dom0/dom0_core.c rename to xenolinux-2.4.25-sparse/arch/xeno/drivers/dom0/core.c index 08144d9678..c7f1fd496b 100644 --- a/xenolinux-2.4.25-sparse/arch/xeno/drivers/dom0/dom0_core.c +++ b/xenolinux-2.4.25-sparse/arch/xeno/drivers/dom0/core.c @@ -1,9 +1,9 @@ /****************************************************************************** - * dom0_core.c + * core.c * * Interface to privileged domain-0 commands. * - * Copyright (c) 2002-2003, K A Fraser, B Dragovic + * Copyright (c) 2002-2004, K A Fraser, B Dragovic */ #include @@ -31,11 +31,8 @@ #include #include -#include "../block/xl_block.h" - static struct proc_dir_entry *privcmd_intf; - static int privcmd_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long data) { @@ -83,7 +80,6 @@ static int __init init_module(void) if ( !(start_info.flags & SIF_PRIVILEGED) ) return 0; - /* xeno control interface */ privcmd_intf = create_xeno_proc_entry("privcmd", 0400); if ( privcmd_intf != NULL ) { diff --git a/xenolinux-2.4.25-sparse/arch/xeno/drivers/evtchn/Makefile b/xenolinux-2.4.25-sparse/arch/xeno/drivers/evtchn/Makefile index 8384c8658b..61c983f625 100644 --- a/xenolinux-2.4.25-sparse/arch/xeno/drivers/evtchn/Makefile +++ b/xenolinux-2.4.25-sparse/arch/xeno/drivers/evtchn/Makefile @@ -1,3 +1,3 @@ -O_TARGET := evtchn.o -obj-y := xl_evtchn.o +O_TARGET := drv.o +obj-y := evtchn.o include $(TOPDIR)/Rules.make diff --git a/xenolinux-2.4.25-sparse/arch/xeno/drivers/evtchn/xl_evtchn.c b/xenolinux-2.4.25-sparse/arch/xeno/drivers/evtchn/evtchn.c similarity index 99% rename from xenolinux-2.4.25-sparse/arch/xeno/drivers/evtchn/xl_evtchn.c rename to xenolinux-2.4.25-sparse/arch/xeno/drivers/evtchn/evtchn.c index f642d082cd..a7978ee8d2 100644 --- a/xenolinux-2.4.25-sparse/arch/xeno/drivers/evtchn/xl_evtchn.c +++ 
b/xenolinux-2.4.25-sparse/arch/xeno/drivers/evtchn/evtchn.c @@ -1,5 +1,5 @@ /****************************************************************************** - * xl_evtchn.c + * evtchn.c * * Xenolinux driver for receiving and demuxing event-channel signals. * diff --git a/xenolinux-2.4.25-sparse/arch/xeno/drivers/network/Makefile b/xenolinux-2.4.25-sparse/arch/xeno/drivers/network/Makefile index b44a288a5b..2e4c1f4825 100644 --- a/xenolinux-2.4.25-sparse/arch/xeno/drivers/network/Makefile +++ b/xenolinux-2.4.25-sparse/arch/xeno/drivers/network/Makefile @@ -1,3 +1,3 @@ -O_TARGET := net.o +O_TARGET := drv.o obj-y := network.o include $(TOPDIR)/Rules.make diff --git a/xenolinux-2.4.25-sparse/arch/xeno/drivers/network/network.c b/xenolinux-2.4.25-sparse/arch/xeno/drivers/network/network.c index 512f6530a5..0a1bce2bfa 100644 --- a/xenolinux-2.4.25-sparse/arch/xeno/drivers/network/network.c +++ b/xenolinux-2.4.25-sparse/arch/xeno/drivers/network/network.c @@ -415,7 +415,7 @@ static void network_interrupt(int irq, void *unused, struct pt_regs *ptregs) } -int network_close(struct net_device *dev) +static int network_close(struct net_device *dev) { struct net_private *np = dev->priv; netop_t netop; diff --git a/xenolinux-2.4.25-sparse/arch/xeno/drivers/vnetif/Makefile b/xenolinux-2.4.25-sparse/arch/xeno/drivers/vnetif/Makefile new file mode 100644 index 0000000000..304c2e78ef --- /dev/null +++ b/xenolinux-2.4.25-sparse/arch/xeno/drivers/vnetif/Makefile @@ -0,0 +1,3 @@ +O_TARGET := drv.o +obj-y := vnetif.o +include $(TOPDIR)/Rules.make diff --git a/xenolinux-2.4.25-sparse/arch/xeno/drivers/vnetif/vnetif.c b/xenolinux-2.4.25-sparse/arch/xeno/drivers/vnetif/vnetif.c new file mode 100644 index 0000000000..465dd18233 --- /dev/null +++ b/xenolinux-2.4.25-sparse/arch/xeno/drivers/vnetif/vnetif.c @@ -0,0 +1,552 @@ +/****************************************************************************** + * vnetif.c + * + * Virtual network driver for XenoLinux. 
+ * + * Copyright (c) 2002-2004, K A Fraser + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#define RX_BUF_SIZE ((PAGE_SIZE/2)+1) /* Fool the slab allocator :-) */ + +static void network_interrupt(int irq, void *dev_id, struct pt_regs *ptregs); +static void network_tx_buf_gc(struct net_device *dev); +static void network_alloc_rx_buffers(struct net_device *dev); +static void cleanup_module(void); + +static struct list_head dev_list; + +struct net_private +{ + struct list_head list; + struct net_device *dev; + + struct net_device_stats stats; + NET_RING_IDX rx_resp_cons, tx_resp_cons; + unsigned int net_ring_fixmap_idx, tx_full; + net_ring_t *net_ring; + net_idx_t *net_idx; + spinlock_t tx_lock; + unsigned int idx; /* Domain-specific index of this VIF. */ + + unsigned int rx_bufs_to_notify; + +#define STATE_ACTIVE 0 +#define STATE_SUSPENDED 1 +#define STATE_CLOSED 2 + unsigned int state; + + /* + * {tx,rx}_skbs store outstanding skbuffs. The first entry in each + * array is an index into a chain of free entries. + */ + struct sk_buff *tx_skbs[TX_RING_SIZE+1]; + struct sk_buff *rx_skbs[RX_RING_SIZE+1]; +}; + +/* Access macros for acquiring/freeing slots in {tx,rx}_skbs[]. 
*/ +#define ADD_ID_TO_FREELIST(_list, _id) \ + (_list)[(_id)] = (_list)[0]; \ + (_list)[0] = (void *)(unsigned long)(_id); +#define GET_ID_FROM_FREELIST(_list) \ + ({ unsigned long _id = (unsigned long)(_list)[0]; \ + (_list)[0] = (_list)[_id]; \ + (unsigned short)_id; }) + + +static void _dbg_network_int(struct net_device *dev) +{ + struct net_private *np = dev->priv; + + if ( np->state == STATE_CLOSED ) + return; + + printk(KERN_ALERT "net: tx_full=%d, tx_resp_cons=0x%08x," + " tx_req_prod=0x%08x\nnet: tx_resp_prod=0x%08x," + " tx_event=0x%08x, state=%d\n", + np->tx_full, np->tx_resp_cons, + np->net_idx->tx_req_prod, np->net_idx->tx_resp_prod, + np->net_idx->tx_event, + test_bit(__LINK_STATE_XOFF, &dev->state)); + printk(KERN_ALERT "net: rx_resp_cons=0x%08x," + " rx_req_prod=0x%08x\nnet: rx_resp_prod=0x%08x, rx_event=0x%08x\n", + np->rx_resp_cons, np->net_idx->rx_req_prod, + np->net_idx->rx_resp_prod, np->net_idx->rx_event); +} + + +static void dbg_network_int(int irq, void *unused, struct pt_regs *ptregs) +{ + struct list_head *ent; + struct net_private *np; + list_for_each ( ent, &dev_list ) + { + np = list_entry(ent, struct net_private, list); + _dbg_network_int(np->dev); + } +} + + +static int network_open(struct net_device *dev) +{ + struct net_private *np = dev->priv; + netop_t netop; + int i, ret; + + netop.cmd = NETOP_RESET_RINGS; + netop.vif = np->idx; + if ( (ret = HYPERVISOR_net_io_op(&netop)) != 0 ) + { + printk(KERN_ALERT "Possible net trouble: couldn't reset ring idxs\n"); + return ret; + } + + netop.cmd = NETOP_GET_VIF_INFO; + netop.vif = np->idx; + if ( (ret = HYPERVISOR_net_io_op(&netop)) != 0 ) + { + printk(KERN_ALERT "Couldn't get info for vif %d\n", np->idx); + return ret; + } + + memcpy(dev->dev_addr, netop.u.get_vif_info.vmac, ETH_ALEN); + + set_fixmap(FIX_NETRING0_BASE + np->net_ring_fixmap_idx, + netop.u.get_vif_info.ring_mfn << PAGE_SHIFT); + np->net_ring = (net_ring_t *)fix_to_virt( + FIX_NETRING0_BASE + np->net_ring_fixmap_idx); + 
np->net_idx = &HYPERVISOR_shared_info->net_idx[np->idx]; + + np->rx_bufs_to_notify = 0; + np->rx_resp_cons = np->tx_resp_cons = np->tx_full = 0; + memset(&np->stats, 0, sizeof(np->stats)); + spin_lock_init(&np->tx_lock); + memset(np->net_ring, 0, sizeof(*np->net_ring)); + memset(np->net_idx, 0, sizeof(*np->net_idx)); + + /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */ + for ( i = 0; i <= TX_RING_SIZE; i++ ) + np->tx_skbs[i] = (void *)(i+1); + for ( i = 0; i <= RX_RING_SIZE; i++ ) + np->rx_skbs[i] = (void *)(i+1); + + wmb(); + np->state = STATE_ACTIVE; + + network_alloc_rx_buffers(dev); + + netif_start_queue(dev); + + MOD_INC_USE_COUNT; + + return 0; +} + + +static void network_tx_buf_gc(struct net_device *dev) +{ + NET_RING_IDX i, prod; + unsigned short id; + struct net_private *np = dev->priv; + struct sk_buff *skb; + tx_entry_t *tx_ring = np->net_ring->tx_ring; + + do { + prod = np->net_idx->tx_resp_prod; + + for ( i = np->tx_resp_cons; i != prod; i++ ) + { + id = tx_ring[MASK_NET_TX_IDX(i)].resp.id; + skb = np->tx_skbs[id]; + ADD_ID_TO_FREELIST(np->tx_skbs, id); + dev_kfree_skb_any(skb); + } + + np->tx_resp_cons = prod; + + /* + * Set a new event, then check for race with update of tx_cons. Note + * that it is essential to schedule a callback, no matter how few + * buffers are pending. Even if there is space in the transmit ring, + * higher layers may be blocked because too much data is outstanding: + * in such cases notification from Xen is likely to be the only kick + * that we'll get. 
+ */ + np->net_idx->tx_event = + prod + ((np->net_idx->tx_req_prod - prod) >> 1) + 1; + mb(); + } + while ( prod != np->net_idx->tx_resp_prod ); + + if ( np->tx_full && ((np->net_idx->tx_req_prod - prod) < TX_RING_SIZE) ) + { + np->tx_full = 0; + if ( np->state == STATE_ACTIVE ) + netif_wake_queue(dev); + } +} + + +static inline pte_t *get_ppte(void *addr) +{ + pgd_t *pgd; pmd_t *pmd; pte_t *pte; + pgd = pgd_offset_k( (unsigned long)addr); + pmd = pmd_offset(pgd, (unsigned long)addr); + pte = pte_offset(pmd, (unsigned long)addr); + return pte; +} + + +static void network_alloc_rx_buffers(struct net_device *dev) +{ + unsigned short id; + struct net_private *np = dev->priv; + struct sk_buff *skb; + netop_t netop; + NET_RING_IDX i = np->net_idx->rx_req_prod; + + if ( unlikely((i - np->rx_resp_cons) == RX_RING_SIZE) || + unlikely(np->state != STATE_ACTIVE) ) + return; + + do { + skb = dev_alloc_skb(RX_BUF_SIZE); + if ( unlikely(skb == NULL) ) + break; + + skb->dev = dev; + + if ( unlikely(((unsigned long)skb->head & (PAGE_SIZE-1)) != 0) ) + panic("alloc_skb needs to provide us page-aligned buffers."); + + id = GET_ID_FROM_FREELIST(np->rx_skbs); + np->rx_skbs[id] = skb; + + np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.id = id; + np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.addr = + virt_to_machine(get_ppte(skb->head)); + + np->rx_bufs_to_notify++; + } + while ( (++i - np->rx_resp_cons) != RX_RING_SIZE ); + + /* + * We may have allocated buffers which have entries outstanding in the page + * update queue -- make sure we flush those first! + */ + flush_page_update_queue(); + + np->net_idx->rx_req_prod = i; + np->net_idx->rx_event = np->rx_resp_cons + 1; + + /* Batch Xen notifications. 
*/ + if ( np->rx_bufs_to_notify > (RX_RING_SIZE/4) ) + { + netop.cmd = NETOP_PUSH_BUFFERS; + netop.vif = np->idx; + (void)HYPERVISOR_net_io_op(&netop); + np->rx_bufs_to_notify = 0; + } +} + + +static int network_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + unsigned short id; + struct net_private *np = (struct net_private *)dev->priv; + tx_req_entry_t *tx; + netop_t netop; + NET_RING_IDX i; + + if ( unlikely(np->tx_full) ) + { + printk(KERN_ALERT "%s: full queue wasn't stopped!\n", dev->name); + netif_stop_queue(dev); + return -ENOBUFS; + } + + if ( unlikely((((unsigned long)skb->data & ~PAGE_MASK) + skb->len) >= + PAGE_SIZE) ) + { + struct sk_buff *new_skb = dev_alloc_skb(RX_BUF_SIZE); + if ( unlikely(new_skb == NULL) ) + return 1; + skb_put(new_skb, skb->len); + memcpy(new_skb->data, skb->data, skb->len); + dev_kfree_skb(skb); + skb = new_skb; + } + + spin_lock_irq(&np->tx_lock); + + i = np->net_idx->tx_req_prod; + + id = GET_ID_FROM_FREELIST(np->tx_skbs); + np->tx_skbs[id] = skb; + + tx = &np->net_ring->tx_ring[MASK_NET_TX_IDX(i)].req; + + tx->id = id; + tx->addr = phys_to_machine(virt_to_phys(skb->data)); + tx->size = skb->len; + + wmb(); + np->net_idx->tx_req_prod = i + 1; + + network_tx_buf_gc(dev); + + if ( (i - np->tx_resp_cons) == (TX_RING_SIZE - 1) ) + { + np->tx_full = 1; + netif_stop_queue(dev); + } + + spin_unlock_irq(&np->tx_lock); + + np->stats.tx_bytes += skb->len; + np->stats.tx_packets++; + + /* Only notify Xen if there are no outstanding responses. 
 */ + mb(); + if ( np->net_idx->tx_resp_prod == i ) + { + netop.cmd = NETOP_PUSH_BUFFERS; + netop.vif = np->idx; + (void)HYPERVISOR_net_io_op(&netop); + } + + return 0; +} + + +static inline void _network_interrupt(struct net_device *dev) +{ + struct net_private *np = dev->priv; + unsigned long flags; + struct sk_buff *skb; + rx_resp_entry_t *rx; + NET_RING_IDX i; + + if ( unlikely(np->state == STATE_CLOSED) ) + return; + + spin_lock_irqsave(&np->tx_lock, flags); + network_tx_buf_gc(dev); + spin_unlock_irqrestore(&np->tx_lock, flags); + + again: + for ( i = np->rx_resp_cons; i != np->net_idx->rx_resp_prod; i++ ) + { + rx = &np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].resp; + + skb = np->rx_skbs[rx->id]; + ADD_ID_TO_FREELIST(np->rx_skbs, rx->id); + + if ( unlikely(rx->status != RING_STATUS_OK) ) + { + /* Gate this error. We get a (valid) slew of them on suspend. */ + if ( np->state == STATE_ACTIVE ) + printk(KERN_ALERT "bad buffer on RX ring!(%d)\n", rx->status); + dev_kfree_skb_any(skb); + continue; + } + + /* + * Set up shinfo -- from alloc_skb. This was particularly nasty: the + * shared info is hidden at the back of the data area (presumably so it + * can be shared), but on page flip it gets very spunked. + */ + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->frag_list = NULL; + + phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] = + (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT; + + skb->data = skb->tail = skb->head + rx->offset; + skb_put(skb, rx->size); + skb->protocol = eth_type_trans(skb, dev); + + np->stats.rx_packets++; + + np->stats.rx_bytes += rx->size; + netif_rx(skb); + dev->last_rx = jiffies; + } + + np->rx_resp_cons = i; + + network_alloc_rx_buffers(dev); + + /* Deal with hypervisor racing our resetting of rx_event. 
*/ + mb(); + if ( np->net_idx->rx_resp_prod != i ) + goto again; +} + + +static void network_interrupt(int irq, void *unused, struct pt_regs *ptregs) +{ + struct list_head *ent; + struct net_private *np; + list_for_each ( ent, &dev_list ) + { + np = list_entry(ent, struct net_private, list); + _network_interrupt(np->dev); + } +} + + +static int network_close(struct net_device *dev) +{ + struct net_private *np = dev->priv; + netop_t netop; + + np->state = STATE_SUSPENDED; + wmb(); + + netif_stop_queue(np->dev); + + netop.cmd = NETOP_FLUSH_BUFFERS; + netop.vif = np->idx; + (void)HYPERVISOR_net_io_op(&netop); + + while ( (np->rx_resp_cons != np->net_idx->rx_req_prod) || + (np->tx_resp_cons != np->net_idx->tx_req_prod) ) + { + barrier(); + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(1); + } + + wmb(); + np->state = STATE_CLOSED; + wmb(); + + /* Now no longer safe to take interrupts for this device. */ + clear_fixmap(FIX_NETRING0_BASE + np->net_ring_fixmap_idx); + + MOD_DEC_USE_COUNT; + + return 0; +} + + +static struct net_device_stats *network_get_stats(struct net_device *dev) +{ + struct net_private *np = (struct net_private *)dev->priv; + return &np->stats; +} + + +static int __init init_module(void) +{ +#if 0 + int i, fixmap_idx=-1, err; + struct net_device *dev; + struct net_private *np; + netop_t netop; + + INIT_LIST_HEAD(&dev_list); + + err = request_irq(HYPEREVENT_IRQ(_EVENT_NET), network_interrupt, + SA_SAMPLE_RANDOM, "network", NULL); + if ( err ) + { + printk(KERN_WARNING "Could not allocate network interrupt\n"); + goto fail; + } + + err = request_irq(HYPEREVENT_IRQ(_EVENT_DEBUG), dbg_network_int, + SA_SHIRQ, "net_dbg", &dbg_network_int); + if ( err ) + printk(KERN_WARNING "Non-fatal error -- no debug interrupt\n"); + + for ( i = 0; i < MAX_DOMAIN_VIFS; i++ ) + { + /* If the VIF is invalid then the query hypercall will fail. 
*/ + netop.cmd = NETOP_GET_VIF_INFO; + netop.vif = i; + if ( HYPERVISOR_net_io_op(&netop) != 0 ) + continue; + + /* We actually only support up to 4 vifs right now. */ + if ( ++fixmap_idx == 4 ) + break; + + dev = alloc_etherdev(sizeof(struct net_private)); + if ( dev == NULL ) + { + err = -ENOMEM; + goto fail; + } + + np = dev->priv; + np->state = STATE_CLOSED; + np->net_ring_fixmap_idx = fixmap_idx; + np->idx = i; + + SET_MODULE_OWNER(dev); + dev->open = network_open; + dev->hard_start_xmit = network_start_xmit; + dev->stop = network_close; + dev->get_stats = network_get_stats; + + memcpy(dev->dev_addr, netop.u.get_vif_info.vmac, ETH_ALEN); + + if ( (err = register_netdev(dev)) != 0 ) + { + kfree(dev); + goto fail; + } + + np->dev = dev; + list_add(&np->list, &dev_list); + } + + return 0; + + fail: + cleanup_module(); + return err; +#endif + return 0; +} + + +static void cleanup_module(void) +{ + struct net_private *np; + struct net_device *dev; + + while ( !list_empty(&dev_list) ) + { + np = list_entry(dev_list.next, struct net_private, list); + list_del(&np->list); + dev = np->dev; + unregister_netdev(dev); + kfree(dev); + } +} + + +module_init(init_module); +module_exit(cleanup_module); -- 2.30.2